diff options
Diffstat (limited to 'fs')
169 files changed, 3617 insertions, 2302 deletions
diff --git a/fs/adfs/super.c b/fs/adfs/super.c index f0b999a4961b..017c48a80203 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -6,7 +6,8 @@ */ #include <linux/module.h> #include <linux/init.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root) return 0; } -enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; +enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix}; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_ownmask, "ownmask=%o"}, - {Opt_othmask, "othmask=%o"}, - {Opt_ftsuffix, "ftsuffix=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec adfs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("ownmask", Opt_ownmask), + fsparam_u32oct ("othmask", Opt_othmask), + fsparam_u32 ("ftsuffix", Opt_ftsuffix), + {} }; -static int parse_options(struct super_block *sb, struct adfs_sb_info *asb, - char *options) +static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - int option; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(asb->s_uid)) - return -EINVAL; - break; - case Opt_gid: - if (match_int(args, &option)) - return -EINVAL; - asb->s_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(asb->s_gid)) - return -EINVAL; - break; - case Opt_ownmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_owner_mask = option; - break; - case Opt_othmask: - if (match_octal(args, &option)) - return -EINVAL; - asb->s_other_mask = option; - break; - case Opt_ftsuffix: - if (match_int(args, &option)) - return -EINVAL; - asb->s_ftsuffix = option; - break; - default: - adfs_msg(sb, KERN_ERR, - "unrecognised mount option \"%s\" or missing value", - p); - return -EINVAL; - } + struct adfs_sb_info *asb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, adfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + asb->s_uid = result.uid; + break; + case Opt_gid: + asb->s_gid = result.gid; + break; + case Opt_ownmask: + asb->s_owner_mask = result.uint_32; + break; + case Opt_othmask: + asb->s_other_mask = result.uint_32; + break; + case Opt_ftsuffix: + asb->s_ftsuffix = result.uint_32; + break; + default: + return -EINVAL; } return 0; } -static int adfs_remount(struct super_block *sb, int *flags, char *data) +static int adfs_reconfigure(struct fs_context *fc) { - struct adfs_sb_info temp_asb; - int ret; + struct adfs_sb_info *new_asb = fc->s_fs_info; + struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb); - sync_filesystem(sb); - *flags |= ADFS_SB_FLAGS; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= ADFS_SB_FLAGS; - temp_asb = *ADFS_SB(sb); - ret = parse_options(sb, &temp_asb, data); - if (ret == 0) - *ADFS_SB(sb) = temp_asb; + /* Structure copy newly parsed options */ + *asb = *new_asb; - return ret; + return 0; } static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = { .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, - .remount_fs = adfs_remount, .show_options = adfs_show_options, }; @@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, return 0; } -static int adfs_fill_super(struct super_block *sb, void *data, int silent) +static int adfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct adfs_discrecord *dr; struct object_info root_obj; - struct adfs_sb_info *asb; + struct adfs_sb_info *asb = sb->s_fs_info; struct inode *root; int ret = -EINVAL; + int silent = fc->sb_flags & SB_SILENT; sb->s_flags |= ADFS_SB_FLAGS; - asb = kzalloc(sizeof(*asb), GFP_KERNEL); - if (!asb) - return -ENOMEM; - sb->s_fs_info = asb; sb->s_magic = ADFS_SUPER_MAGIC; sb->s_time_gran = 10000000; - /* set default options */ - asb->s_uid = GLOBAL_ROOT_UID; - asb->s_gid = GLOBAL_ROOT_GID; - asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; - asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; - asb->s_ftsuffix = 0; - - if (parse_options(sb, asb, data)) - goto error; - /* Try to probe the filesystem boot block */ ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); if (ret == -EILSEQ) @@ -453,18 +414,61 @@ error: return ret; } -static struct dentry *adfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int adfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, adfs_fill_super); +} + +static void adfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); + struct adfs_context *asb = fc->s_fs_info; + + kfree(asb); +} + +static const struct fs_context_operations adfs_context_ops = { + .parse_param = adfs_parse_param, + .get_tree = adfs_get_tree, + .reconfigure = adfs_reconfigure, + .free = adfs_free_fc, +}; + +static int adfs_init_fs_context(struct fs_context *fc) +{ + struct adfs_sb_info *asb; + + asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL); + if (!asb) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct adfs_sb_info *old_asb = ADFS_SB(sb); + + /* structure copy existing options before parsing */ + *asb = *old_asb; + } else { + /* set default options */ + asb->s_uid = GLOBAL_ROOT_UID; + asb->s_gid = GLOBAL_ROOT_GID; + asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; + asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; + asb->s_ftsuffix = 0; + } + + fc->ops = &adfs_context_ops; + fc->s_fs_info = asb; + + return 0; } static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", - .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = adfs_init_fs_context, + .parameters = adfs_param_spec, }; MODULE_ALIAS_FS("adfs"); diff --git a/fs/affs/super.c b/fs/affs/super.c index 3c5821339609..2fa40337776d 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -14,7 +14,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/statfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/magic.h> #include <linux/sched.h> #include <linux/cred.h> @@ -27,7 +28,6 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_show_options(struct seq_file *m, struct dentry *root); -static int affs_remount (struct super_block *sb, int *flags, char *data); static void affs_commit_super(struct super_block *sb, int wait) @@ -155,140 +155,114 @@ static const struct super_operations affs_sops = { .put_super = affs_put_super, .sync_fs = affs_sync_fs, .statfs = affs_statfs, - .remount_fs = affs_remount, .show_options = affs_show_options, }; enum { Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect, Opt_reserved, Opt_root, Opt_setgid, Opt_setuid, - Opt_verbose, Opt_volume, Opt_ignore, Opt_err, + Opt_verbose, Opt_volume, Opt_ignore, }; -static const match_table_t tokens = { - {Opt_bs, "bs=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_mufs, "mufs"}, - {Opt_notruncate, "nofilenametruncate"}, - {Opt_prefix, "prefix=%s"}, - {Opt_protect, "protect"}, - {Opt_reserved, "reserved=%u"}, - {Opt_root, "root=%u"}, - {Opt_setgid, "setgid=%u"}, - {Opt_setuid, "setuid=%u"}, - {Opt_verbose, "verbose"}, - {Opt_volume, "volume=%s"}, - {Opt_ignore, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, - {Opt_err, NULL}, +struct affs_context { + kuid_t uid; /* uid to override */ + kgid_t gid; /* gid to override */ + unsigned int mode; /* mode to override */ + unsigned int reserved; /* Number of reserved blocks */ + int root_block; /* FFS root block number */ + int blocksize; /* Initial device blksize */ + char *prefix; /* Prefix for volumes and assigns */ + char volume[32]; /* Vol. prefix for absolute symlinks */ + unsigned long mount_flags; /* Options */ }; -static int -parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root, - int *blocksize, char **prefix, char *volume, unsigned long *mount_opts) +static const struct fs_parameter_spec affs_param_spec[] = { + fsparam_u32 ("bs", Opt_bs), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("mufs", Opt_mufs), + fsparam_flag ("nofilenametruncate", Opt_notruncate), + fsparam_string ("prefix", Opt_prefix), + fsparam_flag ("protect", Opt_protect), + fsparam_u32 ("reserved", Opt_reserved), + fsparam_u32 ("root", Opt_root), + fsparam_gid ("setgid", Opt_setgid), + fsparam_uid ("setuid", Opt_setuid), + fsparam_flag ("verbose", Opt_verbose), + fsparam_string ("volume", Opt_volume), + fsparam_flag ("grpquota", Opt_ignore), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("quota", Opt_ignore), + fsparam_flag ("usrquota", Opt_ignore), + {}, +}; + +static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - - /* Fill in defaults */ - - *uid = current_uid(); - *gid = current_gid(); - *reserved = 2; - *root = -1; - *blocksize = -1; - volume[0] = ':'; - volume[1] = 0; - *mount_opts = 0; - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token, n, option; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_bs: - if (match_int(&args[0], &n)) - return 0; - if (n != 512 && n != 1024 && n != 2048 - && n != 4096) { - pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); - return 0; - } - *blocksize = n; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - *mode = option & 0777; - affs_set_opt(*mount_opts, SF_SETMODE); - break; - case Opt_mufs: - affs_set_opt(*mount_opts, SF_MUFS); - break; - case Opt_notruncate: - affs_set_opt(*mount_opts, SF_NO_TRUNCATE); - break; - case Opt_prefix: - kfree(*prefix); - *prefix = match_strdup(&args[0]); - if (!*prefix) - return 0; - affs_set_opt(*mount_opts, SF_PREFIX); - break; - case Opt_protect: - affs_set_opt(*mount_opts, SF_IMMUTABLE); - break; - case Opt_reserved: - if (match_int(&args[0], reserved)) - return 0; - break; - case Opt_root: - if (match_int(&args[0], root)) - return 0; - break; - case Opt_setgid: - if (match_int(&args[0], &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - affs_set_opt(*mount_opts, SF_SETGID); - break; - case Opt_setuid: - if (match_int(&args[0], &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - affs_set_opt(*mount_opts, SF_SETUID); - break; - case Opt_verbose: - affs_set_opt(*mount_opts, SF_VERBOSE); - break; - case Opt_volume: { - char *vol = match_strdup(&args[0]); - if (!vol) - return 0; - strscpy(volume, vol, 32); - kfree(vol); - break; - } - case Opt_ignore: - /* Silently ignore the quota options */ - break; - default: - pr_warn("Unrecognized mount option \"%s\" or missing value\n", - p); - return 0; + struct affs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int n; + int opt; + + opt = fs_parse(fc, affs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_bs: + n = result.uint_32; + if (n != 512 && n != 1024 && n != 2048 + && n != 4096) { + pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); + return -EINVAL; } + ctx->blocksize = n; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 0777; + affs_set_opt(ctx->mount_flags, SF_SETMODE); + break; + case Opt_mufs: + affs_set_opt(ctx->mount_flags, SF_MUFS); + break; + case Opt_notruncate: + affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE); + break; + case Opt_prefix: + kfree(ctx->prefix); + ctx->prefix = param->string; + param->string = NULL; + affs_set_opt(ctx->mount_flags, SF_PREFIX); + break; + case Opt_protect: + affs_set_opt(ctx->mount_flags, SF_IMMUTABLE); + break; + case Opt_reserved: + ctx->reserved = result.uint_32; + break; + case Opt_root: + ctx->root_block = result.uint_32; + break; + case Opt_setgid: + ctx->gid = result.gid; + affs_set_opt(ctx->mount_flags, SF_SETGID); + break; + case Opt_setuid: + ctx->uid = result.uid; + affs_set_opt(ctx->mount_flags, SF_SETUID); + break; + case Opt_verbose: + affs_set_opt(ctx->mount_flags, SF_VERBOSE); + break; + case Opt_volume: + strscpy(ctx->volume, param->string, 32); + break; + case Opt_ignore: + /* Silently ignore the quota options */ + break; + default: + return -EINVAL; } - return 1; + return 0; } static int affs_show_options(struct seq_file *m, struct dentry *root) @@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root) * hopefully have the guts to do so. Until then: sorry for the mess. */ -static int affs_fill_super(struct super_block *sb, void *data, int silent) +static int affs_fill_super(struct super_block *sb, struct fs_context *fc) { struct affs_sb_info *sbi; + struct affs_context *ctx = fc->fs_private; struct buffer_head *root_bh = NULL; struct buffer_head *boot_bh; struct inode *root_inode = NULL; - s32 root_block; + int silent = fc->sb_flags & SB_SILENT; int size, blocksize; u32 chksum; int num_bm; int i, j; - kuid_t uid; - kgid_t gid; - int reserved; - unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; int ret; - pr_debug("read_super(%s)\n", data ? (const char *)data : "no options"); - sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; sb->s_flags |= SB_NODIRATIME; @@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock); - if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, - &blocksize,&sbi->s_prefix, - sbi->s_volume, &mount_flags)) { - pr_err("Error parsing options\n"); - return -EINVAL; - } - /* N.B. after this point s_prefix must be released */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; + sbi->s_reserved = ctx->reserved; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; + memcpy(sbi->s_volume, ctx->volume, 32); - sbi->s_flags = mount_flags; - sbi->s_mode = i; - sbi->s_uid = uid; - sbi->s_gid = gid; - sbi->s_reserved= reserved; + /* N.B. after this point s_prefix must be released */ /* Get the size of the device in 512-byte blocks. * If we later see that the partition uses bigger @@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) i = bdev_logical_block_size(sb->s_bdev); j = PAGE_SIZE; + blocksize = ctx->blocksize; if (blocksize > 0) { i = j = blocksize; size = size / (blocksize / 512); } for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { - sbi->s_root_block = root_block; - if (root_block < 0) - sbi->s_root_block = (reserved + size - 1) / 2; + sbi->s_root_block = ctx->root_block; + if (ctx->root_block < 0) + sbi->s_root_block = (ctx->reserved + size - 1) / 2; pr_debug("setting blocksize to %d\n", blocksize); affs_set_blocksize(sb, blocksize); sbi->s_partition_size = size; @@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) "size=%d, reserved=%d\n", sb->s_id, sbi->s_root_block + num_bm, - blocksize, size, reserved); + ctx->blocksize, size, ctx->reserved); root_bh = affs_bread(sb, sbi->s_root_block + num_bm); if (!root_bh) continue; @@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) got_root: /* Keep super block in cache */ sbi->s_root_bh = root_bh; - root_block = sbi->s_root_block; + ctx->root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); @@ -506,7 +473,7 @@ got_root: return -EINVAL; } - if (affs_test_opt(mount_flags, SF_VERBOSE)) { + if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 31 : len, @@ -528,7 +495,7 @@ got_root: /* set up enough so that it can read an inode */ - root_inode = affs_iget(sb, root_block); + root_inode = affs_iget(sb, ctx->root_block); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -548,56 +515,43 @@ got_root: return 0; } -static int -affs_remount(struct super_block *sb, int *flags, char *data) +static int affs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + struct affs_context *ctx = fc->fs_private; struct affs_sb_info *sbi = AFFS_SB(sb); - int blocksize; - kuid_t uid; - kgid_t gid; - int mode; - int reserved; - int root_block; - unsigned long mount_flags; int res = 0; - char volume[32]; - char *prefix = NULL; - - pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= SB_NODIRATIME; - - memcpy(volume, sbi->s_volume, 32); - if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &prefix, volume, - &mount_flags)) { - kfree(prefix); - return -EINVAL; - } + fc->sb_flags |= SB_NODIRATIME; flush_delayed_work(&sbi->sb_work); - sbi->s_flags = mount_flags; - sbi->s_mode = mode; - sbi->s_uid = uid; - sbi->s_gid = gid; + /* + * NB: Historically, only mount_flags, mode, uid, gic, prefix, + * and volume are accepted during remount. + */ + sbi->s_flags = ctx->mount_flags; + sbi->s_mode = ctx->mode; + sbi->s_uid = ctx->uid; + sbi->s_gid = ctx->gid; /* protect against readers */ spin_lock(&sbi->symlink_lock); - if (prefix) { + if (ctx->prefix) { kfree(sbi->s_prefix); - sbi->s_prefix = prefix; + sbi->s_prefix = ctx->prefix; + ctx->prefix = NULL; } - memcpy(sbi->s_volume, volume, 32); + memcpy(sbi->s_volume, ctx->volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & SB_RDONLY) + if (fc->sb_flags & SB_RDONLY) affs_free_bitmap(sb); else - res = affs_init_bitmap(sb, flags); + res = affs_init_bitmap(sb, &fc->sb_flags); return res; } @@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry *affs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int affs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); + return get_tree_bdev(fc, affs_fill_super); } static void affs_kill_sb(struct super_block *sb) @@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb) } } +static void affs_free_fc(struct fs_context *fc) +{ + struct affs_context *ctx = fc->fs_private; + + kfree(ctx->prefix); + kfree(ctx); +} + +static const struct fs_context_operations affs_context_ops = { + .parse_param = affs_parse_param, + .get_tree = affs_get_tree, + .reconfigure = affs_reconfigure, + .free = affs_free_fc, +}; + +static int affs_init_fs_context(struct fs_context *fc) +{ + struct affs_context *ctx; + + ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct affs_sb_info *sbi = AFFS_SB(sb); + + /* + * NB: historically, no options other than volume were + * preserved across a remount unless they were explicitly + * passed in. + */ + memcpy(ctx->volume, sbi->s_volume, 32); + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->reserved = 2; + ctx->root_block = -1; + ctx->blocksize = -1; + ctx->volume[0] = ':'; + } + + fc->ops = &affs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", - .mount = affs_mount, .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = affs_init_fs_context, + .parameters = affs_param_spec, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index f8622ed72e08..ada363af5aab 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -12,6 +12,7 @@ #include <linux/swap.h> #include <linux/ctype.h> #include <linux/sched.h> +#include <linux/iversion.h> #include <linux/task_io_accounting_ops.h> #include "internal.h" #include "afs_fs.h" @@ -1823,6 +1824,8 @@ error: static void afs_rename_success(struct afs_operation *op) { + struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry)); + _enter("op=%08x", op->debug_id); op->ctime = op->file[0].scb.status.mtime_client; @@ -1832,6 +1835,22 @@ static void afs_rename_success(struct afs_operation *op) op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); } + + /* If we're moving a subdir between dirs, we need to update + * its DV counter too as the ".." will be altered. + */ + if (S_ISDIR(vnode->netfs.inode.i_mode) && + op->file[0].vnode != op->file[1].vnode) { + u64 new_dv; + + write_seqlock(&vnode->cb_lock); + + new_dv = vnode->status.data_version + 1; + vnode->status.data_version = new_dv; + inode_set_iversion_raw(&vnode->netfs.inode, new_dv); + + write_sequnlock(&vnode->cb_lock); + } } static void afs_rename_edit_dir(struct afs_operation *op) @@ -1873,6 +1892,12 @@ static void afs_rename_edit_dir(struct afs_operation *op) &vnode->fid, afs_edit_dir_for_rename_2); } + if (S_ISDIR(vnode->netfs.inode.i_mode) && + new_dvnode != orig_dvnode && + test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + afs_edit_dir_update_dotdot(vnode, new_dvnode, + afs_edit_dir_for_rename_sub); + new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index a71bff10496b..fe223fb78111 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -127,10 +127,10 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) /* * Scan a directory block looking for a dirent of the right name. */ -static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name, +static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name, unsigned int blocknum) { - union afs_xdr_dirent *de; + const union afs_xdr_dirent *de; u64 bitmap; int d, len, n; @@ -492,3 +492,90 @@ error: clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); goto out_unmap; } + +/* + * Edit a subdirectory that has been moved between directories to update the + * ".." entry. + */ +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why) +{ + union afs_xdr_dir_block *block; + union afs_xdr_dirent *de; + struct folio *folio; + unsigned int nr_blocks, b; + pgoff_t index; + loff_t i_size; + int slot; + + _enter(""); + + i_size = i_size_read(&vnode->netfs.inode); + if (i_size < AFS_DIR_BLOCK_SIZE) { + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + return; + } + nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; + + /* Find a block that has sufficient slots available. Each folio + * contains two or more directory blocks. + */ + for (b = 0; b < nr_blocks; b++) { + index = b / AFS_DIR_BLOCKS_PER_PAGE; + folio = afs_dir_get_folio(vnode, index); + if (!folio) + goto error; + + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); + + /* Abandon the edit if we got a callback break. */ + if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) + goto invalidated; + + slot = afs_dir_scan_block(block, &dotdot_name, b); + if (slot >= 0) + goto found_dirent; + + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + } + + /* Didn't find the dirent to clobber. Download the directory again. */ + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; + +found_dirent: + de = &block->dirents[slot]; + de->u.vnode = htonl(new_dvnode->fid.vnode); + de->u.unique = htonl(new_dvnode->fid.unique); + + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot, + ntohl(de->u.vnode), ntohl(de->u.unique), ".."); + + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); + +out: + _leave(""); + return; + +invalidated: + kunmap_local(block); + folio_unlock(folio); + folio_put(folio); + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; + +error: + trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, + 0, 0, 0, 0, ".."); + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + goto out; +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 52aab09a32a9..c9d620175e80 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1073,6 +1073,8 @@ extern void afs_check_for_remote_deletion(struct afs_operation *); extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, + enum afs_edit_dir_reason why); /* * dir_silly.c @@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); diff --git a/fs/attr.c b/fs/attr.c index c04d19b58f12..9caf63d20d03 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -272,6 +272,47 @@ out_big: EXPORT_SYMBOL(inode_newsize_ok); /** + * setattr_copy_mgtime - update timestamps for mgtime inodes + * @inode: inode timestamps to be updated + * @attr: attrs for the update + * + * With multigrain timestamps, take more care to prevent races when + * updating the ctime. Always update the ctime to the very latest using + * the standard mechanism, and use that to populate the atime and mtime + * appropriately (unless those are being set to specific values). + */ +static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct timespec64 now; + + if (ia_valid & ATTR_CTIME) { + /* + * In the case of an update for a write delegation, we must respect + * the value in ia_ctime and not use the current time. + */ + if (ia_valid & ATTR_DELEG) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else + now = inode_set_ctime_current(inode); + } else { + /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ + WARN_ON_ONCE(ia_valid & ATTR_MTIME); + now = current_time(inode); + } + + if (ia_valid & ATTR_ATIME_SET) + inode_set_atime_to_ts(inode, attr->ia_atime); + else if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, now); + + if (ia_valid & ATTR_MTIME_SET) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + else if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, now); +} + +/** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated @@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(inode, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, mode &= ~S_ISGID; inode->i_mode = mode; } + + if (is_mgtime(inode)) + return setattr_copy_mgtime(inode, attr); + + if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) { + if (ia_valid & ATTR_DELEG) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else + inode_set_ctime_to_ts(inode, attr->ia_ctime); + } } EXPORT_SYMBOL(setattr_copy); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index f011e026358e..6d57efbb8110 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { + unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); @@ -133,7 +134,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); - if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", @@ -141,8 +142,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } } else { - unsigned int inr = _IOC_NR(cmd); - if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f8e87c6721b1..163a67b97a40 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type) static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { + if (a.data_type >= BCH_DATA_NR) + return 0; + if (!data_type_movable(a.data_type) || !bch2_bucket_sectors_fragmented(ca, a)) return 0; diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5836870ab882..372178c8d416 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; + rcu_read_unlock(); + ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = ob - c->open_buckets; @@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, u64 avail; bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark); + avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, have_cache, ob); @@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, --c->open_buckets_partial_nr; swap(c->open_buckets_partial[i], c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + + rcu_read_lock(); + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + rcu_read_unlock(); + spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); spin_lock(&c->freelist_lock); @@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - (!ca || ob->dev == ca->dev_idx)) + if (ob->valid && (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 47455a85c909..654a58132a4d 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -52,6 +52,12 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + int ret = 0; + + bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, + c, backpointer_level_bad, + "backpointer level bad: %u >= %u", + bp.v->level, BTREE_MAX_DEPTH); rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); @@ -64,7 +70,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); rcu_read_unlock(); - int ret = 0; bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), @@ -947,9 +952,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, - struct bkey_s_c_backpointer bp, + struct bkey_s_c bp_k, struct bkey_buf *last_flushed) { + if (bp_k.k->type != KEY_TYPE_backpointer) + return 0; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bch_fs *c = trans->c; struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); @@ -1004,9 +1013,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed); + check_one_backpointer(trans, start, end, k, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f4151ee51b03..e94a83b8113e 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,6 +555,7 @@ struct bch_dev { u64 alloc_cursor[3]; unsigned nr_open_buckets; + unsigned nr_partial_buckets; unsigned nr_btree_reserve; size_t inc_gen_needs_gc; diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 587d7318a2e8..995ba32e9b6e 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -643,7 +643,7 @@ int bch2_bkey_format_invalid(struct bch_fs *c, enum bch_validate_flags flags, struct printbuf *err) { - unsigned i, bits = KEY_PACKED_BITS_START; + unsigned bits = KEY_PACKED_BITS_START; if (f->nr_fields != BKEY_NR_FIELDS) { prt_printf(err, "incorrect number of fields: got %u, should be %u", @@ -655,9 +655,8 @@ int bch2_bkey_format_invalid(struct bch_fs *c, * Verify that the packed format can't represent fields larger than the * unpacked format: */ - for (i = 0; i < f->nr_fields; i++) { - if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) && - bch2_bkey_format_field_overflows(f, i)) { + for (unsigned i = 0; i < f->nr_fields; i++) { + if (bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); unsigned packed_bits = min(64, f->bits_per_field[i]); diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 6e4afb2b5441..7123019ab3bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -59,16 +59,38 @@ static inline size_t btree_cache_can_free(struct btree_cache_list *list) static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); + if (b->c.lock.readers) - list_move(&b->list, &bc->freed_pcpu); + list_add(&b->list, &bc->freed_pcpu); else - list_move(&b->list, &bc->freed_nonpcpu); + list_add(&b->list, &bc->freed_nonpcpu); +} + +static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); + BUG_ON(!b->data); + + bc->nr_freeable++; + list_add(&b->list, &bc->freeable); } -static void btree_node_data_free(struct bch_fs *c, struct btree *b) +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + mutex_lock(&bc->lock); + __bch2_btree_node_to_freelist(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); /* @@ -94,11 +116,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->nr_freeable--; - btree_node_to_freedlist(bc, b); } +static void btree_node_data_free(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); + --bc->nr_freeable; + __btree_node_data_free(bc, b); +} + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -174,21 +202,10 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) bch2_btree_lock_init(&b->c, 0); - bc->nr_freeable++; - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); return b; } -void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) -{ - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); -} - static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) { struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); @@ -236,11 +253,11 @@ void bch2_btree_cache_unpin(struct bch_fs *c) /* Btree in memory cache - hash table */ -void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { lockdep_assert_held(&bc->lock); - int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); /* Cause future lookups for this node to fail: */ @@ -248,17 +265,22 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) if (b->c.btree_id < BTREE_ID_NR) --bc->nr_by_btree[b->c.btree_id]; + --bc->live[btree_node_pinned(b)].nr; + list_del_init(&b->list); +} - bc->live[btree_node_pinned(b)].nr--; - bc->nr_freeable++; - list_move(&b->list, &bc->freeable); +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + __bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_to_freelist(bc, b); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(!list_empty(&b->list)); BUG_ON(b->hash_val); - b->hash_val = btree_ptr_hash_val(&b->key); + b->hash_val = btree_ptr_hash_val(&b->key); int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); if (ret) @@ -270,10 +292,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) bool p = __btree_node_pinned(bc, b); mod_bit(BTREE_NODE_pinned, &b->flags, p); - list_move_tail(&b->list, &bc->live[p].list); + list_add_tail(&b->list, &bc->live[p].list); bc->live[p].nr++; - - bc->nr_freeable--; return 0; } @@ -485,7 +505,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, goto out; if (!btree_node_reclaim(c, b, true)) { - btree_node_data_free(c, b); + btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; @@ -501,10 +521,10 @@ restart: bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; } else if (!btree_node_reclaim(c, b, true)) { - bch2_btree_node_hash_remove(bc, b); + __bch2_btree_node_hash_remove(bc, b); + __btree_node_data_free(bc, b); freed++; - btree_node_data_free(c, b); bc->nr_freed++; six_unlock_write(&b->c.lock); @@ -587,7 +607,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); } BUG_ON(!bch2_journal_error(&c->journal) && @@ -786,8 +806,8 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); -got_node: +got_node: /* * btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: @@ -796,7 +816,12 @@ got_node: if (!btree_node_reclaim(c, b2, false)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); + + list_del_init(&b2->list); + --bc->nr_freeable; btree_node_to_freedlist(bc, b2); + mutex_unlock(&bc->lock); + six_unlock_write(&b2->c.lock); six_unlock_intent(&b2->c.lock); goto got_mem; @@ -810,11 +835,8 @@ got_node: goto err; } - mutex_lock(&bc->lock); - bc->nr_freeable++; got_mem: - mutex_unlock(&bc->lock); - + BUG_ON(!list_empty(&b->list)); BUG_ON(btree_node_hashed(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); @@ -845,7 +867,7 @@ err: if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); clear_btree_node_just_written(b2); - bch2_btree_node_hash_remove(bc, b2); + __bch2_btree_node_hash_remove(bc, b2); if (b) { swap(b->data, b2->data); @@ -855,9 +877,9 @@ err: six_unlock_intent(&b2->c.lock); } else { b = b2; - list_del_init(&b->list); } + BUG_ON(!list_empty(&b->list)); mutex_unlock(&bc->lock); trace_and_count(c, btree_cache_cannibalize, trans); @@ -936,7 +958,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b->hash_val = 0; mutex_lock(&bc->lock); - list_add(&b->list, &bc->freeable); + __bch2_btree_node_to_freelist(bc, b); mutex_unlock(&bc->lock); six_unlock_write(&b->c.lock); @@ -1312,9 +1334,12 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); - if (!IS_ERR_OR_NULL(b)) + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + if (b) six_unlock_read(&b->c.lock); - return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b); + return 0; } void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) @@ -1353,7 +1378,7 @@ wait_on_io: mutex_lock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - btree_node_data_free(c, b); + btree_node_data_free(bc, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 367acd217c6a..66e86d1a178d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -14,7 +14,9 @@ void bch2_recalc_btree_reserve(struct bch_fs *); void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); +void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); + int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0ca3feeb42c8..81dcf9e512c0 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -182,7 +182,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) bch2_btree_node_drop_keys_outside_node(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, &new->k_i); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 6296a11ccb09..839d68802e42 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -733,11 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node (offset %u len %u but written %zu)", - offset, sectors, ptr_written ?: btree_sectors(c))) { + offset, sectors, ptr_written ?: btree_sectors(c))) i->u64s = 0; - ret = 0; - goto out; - } btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, @@ -829,7 +826,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BSET_BIG_ENDIAN(i), write, &bn->format); } -out: fsck_err: printbuf_exit(&buf2); printbuf_exit(&buf1); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 0883cf6e1a3e..eef9b89c561d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); + if (!k.k) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " at btree "); + bch2_btree_pos_to_text(&buf, c, l->b); + + ret = bch2_fs_topology_error(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } bch2_bkey_buf_reassemble(out, c, k); @@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, c->opts.btree_node_prefetch) ret = btree_path_prefetch_j(trans, path, &jiter); +err: bch2_btree_and_journal_iter_exit(&jiter); return ret; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7aedb134e9f..30131c3bdd97 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -186,7 +186,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, .ptrs[0].offset = offset, .ptrs[0].dev = ca->dev_idx, - .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), + .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)), }; rcu_read_unlock(); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 64f0928e1137..d596ef93239f 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -237,10 +237,6 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) BUG_ON(b->will_make_reachable); clear_btree_node_noevict(b); - - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); } static void bch2_btree_node_free_inmem(struct btree_trans *trans, @@ -252,12 +248,12 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_lock_write_nofail(trans, path, &b->c); + __btree_node_free(trans, b); + mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -289,7 +285,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); BUG_ON(p->nr >= ARRAY_SIZE(p->b)); @@ -521,8 +517,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); __btree_node_free(trans, b); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); + bch2_btree_node_to_freelist(c, b); } } } @@ -1434,6 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, } } +static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) +{ + if (insert_keys) + for_each_keylist_key(insert_keys, k) + if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) + return true; + return false; +} + /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) @@ -1441,7 +1445,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, static void __btree_split_node(struct btree_update *as, struct btree_trans *trans, struct btree *b, - struct btree *n[2]) + struct btree *n[2], + struct keylist *insert_keys) { struct bkey_packed *k; struct bpos n1_pos = POS_MIN; @@ -1476,7 +1481,8 @@ static void __btree_split_node(struct btree_update *as, if (b->c.level && u64s < n1_u64s && u64s + k->u64s >= n1_u64s && - bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) + (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || + key_deleted_in_insert(insert_keys, uk.p))) n1_u64s += k->u64s; i = u64s >= n1_u64s; @@ -1603,7 +1609,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); - __btree_split_node(as, trans, b, n); + __btree_split_node(as, trans, b, n, keys); if (keys) { btree_split_insert_keys(as, trans, path, n1, keys); @@ -2392,7 +2398,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, if (new_hash) { mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, new_hash); - bch2_btree_node_hash_remove(&c->btree_cache, b); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new_key); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 3f56b584f8ec..1639c60dffa0 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -277,6 +277,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; + bch2_trans_unlock(trans); bch2_trans_begin(trans); @@ -491,7 +495,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, + bool *did_work) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; @@ -502,6 +507,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; + /* * On memory allocation failure, bch2_btree_write_buffer_flush_locked() * is not guaranteed to empty wb->inc: @@ -521,17 +528,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool did_work = false; - return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; + bool did_work = false; trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); - return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); +} + +/* + * The write buffer requires flushing when going RO: keys in the journal for the + * write buffer don't have a journal pin yet + */ +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) +{ + if (bch2_journal_error(&c->journal)) + return false; + + bool did_work = false; + bch2_trans_run(c, btree_write_buffer_flush_seq(trans, + journal_cur_seq(&c->journal), &did_work)); + return did_work; } int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index 725e79654216..d535cea28bde 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -21,6 +21,7 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index fd5e6ccad45e..ccc78bfe2fd4 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -103,12 +103,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) return gens->b + b; } -static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b) +static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) +{ + u8 *gen = bucket_gen(ca, b); + return gen ? *gen : -1; +} + +static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { rcu_read_lock(); - u8 gen = *bucket_gen(ca, b); + int ret = bucket_gen_get_rcu(ca, b); rcu_read_unlock(); - return gen; + return ret; } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -169,10 +175,8 @@ static inline int gen_after(u8 a, u8 b) static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)); - if (!gen) - return -1; - return gen_after(*gen, ptr->gen); + int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr)); + return gen < 0 ? gen : gen_after(gen, ptr->gen); } /** @@ -184,7 +188,6 @@ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr rcu_read_lock(); int ret = dev_ptr_stale_rcu(ca, ptr); rcu_read_unlock(); - return ret; } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index a6ee0beee6b0..8e75a852b358 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -236,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (((1U << i) & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), ptr); rewrites_found |= 1U << i; } i++; @@ -284,7 +285,8 @@ restart_drop_extra_replicas: durability - ptr_durability >= m->op.opts.data_replicas) { durability -= ptr_durability; - bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); goto restart_drop_extra_replicas; } } @@ -295,7 +297,7 @@ restart_drop_extra_replicas: bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert)); ret = bch2_sum_sector_overwrites(trans, &iter, insert, &should_check_enospc, @@ -558,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - struct data_update_opts data_opts) + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -569,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, if (ret) return ret; - while (data_opts.kill_ptrs) { - unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + while (data_opts->kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts->kill_ptrs); bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); - data_opts.kill_ptrs ^= 1U << drop; + data_opts->kill_ptrs ^= 1U << drop; } /* @@ -581,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, * will do the appropriate thing with it (turning it into a * KEY_TYPE_error key, or just a discard if it was a cached extent) */ - bch2_extent_normalize(c, bkey_i_to_s(n)); + bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n)); /* * Since we're not inserting through an extent iterator @@ -720,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); goto out; } diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 8d36365bdea8..e4b50723428e 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *, int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, struct bkey_s_c, - struct data_update_opts); + struct bch_io_opts *, + struct data_update_opts *); void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a0aa5bb467d9..749dcf368841 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1870,6 +1870,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, } h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); + if (!h) { + h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); + goto err; + } found: if (h->rw_devs_change_count != c->rw_devs_change_count) ec_stripe_head_devs_update(c, h); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 649263516ab1..9c4fe5cdbfb7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -83,6 +83,8 @@ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOMEM, ENOMEM_disk_accounting) \ + x(ENOMEM, ENOMEM_stripe_head_alloc) \ + x(ENOMEM, ENOMEM_journal_read_bucket) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -222,6 +224,7 @@ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ x(BCH_ERR_invalid_sb, invalid_sb_members) \ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index cc0d22085aef..37e3d69bec06 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke return NULL; } -void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, + struct bch_extent_ptr *ptr) +{ + if (!opts->promote_target || + !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + return false; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + + return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); +} + +void bch2_extent_ptr_set_cached(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; - union bch_extent_entry *ec = NULL; + struct extent_ptr_decoded p; - bkey_extent_entry_for_each(ptrs, entry) { + rcu_read_lock(); + if (!want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr_noerror(k, ptr); + goto out; + } + + /* + * Stripes can't contain cached data, for - reasons. + * + * Possibly something we can fix in the future? + */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (&entry->ptr == ptr) { - ptr->cached = true; - if (ec) - extent_entry_drop(k, ec); - return; + if (p.has_ec) + bch2_bkey_drop_ptr_noerror(k, ptr); + else + ptr->cached = true; + goto out; } - if (extent_entry_is_stripe_ptr(entry)) - ec = entry; - else if (extent_entry_is_ptr(entry)) - ec = NULL; - } - BUG(); +out: + rcu_read_unlock(); } /* - * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * @@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) rcu_read_lock(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && - (ca = bch2_dev_rcu(c, ptr->dev)) && - dev_ptr_stale_rcu(ca, ptr) > 0); + (!(ca = bch2_dev_rcu(c, ptr->dev)) || + dev_ptr_stale_rcu(ca, ptr) > 0)); + rcu_read_unlock(); + + return bkey_deleted(k.k); +} + +/* + * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc. + * + * Like bch2_extent_normalize(), but also only keeps a single cached pointer on + * the promote target. + */ +bool bch2_extent_normalize_by_opts(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s k) +{ + struct bkey_ptrs ptrs; + bool have_cached_ptr; + + rcu_read_lock(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) { + bch2_bkey_drop_ptr(k, ptr); + goto restart_drop_ptrs; + } + have_cached_ptr = true; + } rcu_read_unlock(); return bkey_deleted(k.k); @@ -1310,7 +1364,7 @@ void bch2_ptr_swab(struct bkey_s k) for (entry = ptrs.start; entry < ptrs.end; entry = extent_entry_next(entry)) { - switch (extent_entry_type(entry)) { + switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32: @@ -1330,6 +1384,9 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_rebalance: break; + default: + /* Bad entry type: will be caught by validate() */ + return; } } } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 923a5f1849a8..bcffcf60aaaf 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -686,9 +686,12 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); struct bch_extent_ptr * bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); +void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *, + struct bkey_s, struct bch_extent_ptr *); +bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 15d3f073b824..2456c41b215e 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, POS(inode->v.i_ino, start_sector), BTREE_ITER_slots|BTREE_ITER_intent); - while (!ret && bkey_lt(iter.pos, end_pos)) { + while (!ret) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; @@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); + if (bkey_ge(iter.pos, end_pos)) + break; + ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) @@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, - opts.data_replicas, true)) + opts.data_replicas, true)) { ret = drop_locks_do(trans, (bch2_clamp_data_hole(&inode->v, &hole_start, &hole_end, opts.data_replicas, false), 0)); + if (ret) + goto bkey_err; + } bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); if (ret) @@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); if (bch2_mark_pagecache_reserved(inode, &hole_start, - iter.pos.offset, true)) - drop_locks_do(trans, + iter.pos.offset, true)) { + ret = drop_locks_do(trans, bch2_mark_pagecache_reserved(inode, &hole_start, iter.pos.offset, false)); + if (ret) + goto bkey_err; + } bkey_err: bch2_quota_reservation_put(c, inode, "a_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index fc246f342820..b3b934a87c6d 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -262,7 +262,8 @@ err: bio_free_pages(&(*rbio)->bio); kfree(*rbio); *rbio = NULL; - kfree(op); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -802,16 +803,15 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); - u8 *gen = bucket_gen(ca, iter.pos.offset); - if (gen) { - + int gen = bucket_gen_get(ca, iter.pos.offset); + if (gen >= 0) { prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); - prt_printf(&buf, "memory gen: %u", *gen); + prt_printf(&buf, "memory gen: %u", gen); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); if (!ret) { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 8609e25e450f..96720adcfee0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1300,11 +1300,8 @@ retry: bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); - rcu_read_lock(); - u8 *gen = bucket_gen(ca, i->b.offset); - stale = !gen ? -1 : gen_after(*gen, i->gen); - rcu_read_unlock(); - + int gen = bucket_gen_get(ca, i->b.offset); + stale = gen < 0 ? gen : gen_after(gen, i->gen); if (unlikely(stale)) { stale_at = i; goto err_bucket_stale; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 954f6a96e0f4..fb35dd336331 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -708,6 +708,9 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs container_of(entry, struct jset_entry_dev_usage, entry); unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + if (vstruct_bytes(entry) < sizeof(*u)) + return; + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); printbuf_indent_add(out, 2); @@ -1012,6 +1015,8 @@ reread: nr_bvecs = buf_pages(buf->data, sectors_read << 9); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_journal_read_bucket; bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8c456d8b8b99..0ef4a86850bb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt, if (!data_opts.rewrite_ptrs && !data_opts.extra_replicas) { if (data_opts.kill_ptrs) - return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6673cbd8bdb9..0e2ee262fbd4 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -226,7 +226,7 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices), \ + .min = 0, .max = ARRAY_SIZE(_choices) - 1, \ .choices = _choices #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ @@ -428,7 +428,7 @@ void bch2_opt_to_text(struct printbuf *out, prt_printf(out, "%lli", v); break; case BCH_OPT_STR: - if (v < opt->min || v >= opt->max - 1) + if (v < opt->min || v >= opt->max) prt_printf(out, "(invalid option %lli)", v); else if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 454b5a32dd7f..3c7f941dde39 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); @@ -863,6 +862,13 @@ use_clean: if (ret) goto err; + /* + * Normally set by the appropriate recovery pass: when cleared, this + * indicates we're in early recovery and btree updates should be done by + * being applied to the journal replay keys. _Must_ be cleared before + * multithreaded use: + */ + set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ @@ -1002,6 +1008,7 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); + struct bch_member *m; int ret; bch_notice(c, "initializing new filesystem"); @@ -1018,6 +1025,14 @@ int bch2_fs_initialize(struct bch_fs *c) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } + + for_each_member_device(c, ca) { + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); + ca->mi = bch2_mi_to_cpu(m); + } + + bch2_write_super(c); mutex_unlock(&c->sb_lock); c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 735b8adc8f9d..dff589ddc984 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -27,6 +27,12 @@ const char * const bch2_recovery_passes[] = { NULL }; +/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ +static int bch2_recovery_pass_empty(struct bch_fs *c) +{ + return 0; +} + static int bch2_set_may_go_rw(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; @@ -221,6 +227,12 @@ int bch2_run_recovery_passes(struct bch_fs *c) { int ret = 0; + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { if (c->opts.recovery_pass_last && c->curr_recovery_pass > c->opts.recovery_pass_last) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 9d96c06e365c..94dc20ca2065 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -13,6 +13,7 @@ * must never change: */ #define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ x(scan_for_btree_nodes, 37, 0) \ x(check_topology, 4, 0) \ x(accounting_read, 39, PASS_ALWAYS) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index ae715ff658e8..8767c33c2b51 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -143,6 +143,9 @@ UPGRADE_TABLE() static int have_stripes(struct bch_fs *c) { + if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b)) + return 0; + return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b); } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 937275d061fe..9feb6739f77a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -136,7 +136,9 @@ enum bch_fsck_flags { x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ x(need_discard_freespace_key_bad, 124, 0) \ + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ x(backpointer_to_missing_ptr, 128, 0) \ @@ -177,7 +179,9 @@ enum bch_fsck_flags { x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_pos_bad, 292, 0) \ x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ x(stripe_csum_granularity_bad, 290, 0) \ @@ -302,7 +306,7 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 291, 0) + x(MAX, 295, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index fb08dd680dac..116131f95815 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -163,7 +163,7 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } - if (m.btree_bitmap_shift >= 64) { + if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) { prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift); return -BCH_ERR_invalid_sb_members; } @@ -450,7 +450,7 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns m->btree_bitmap_shift += resize; } - BUG_ON(m->btree_bitmap_shift > 57); + BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX); BUG_ON(end > 64ULL << m->btree_bitmap_shift); for (unsigned bit = start >> m->btree_bitmap_shift; diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index d727d2dfda08..2adf1221a440 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -66,6 +66,12 @@ struct bch_member { }; /* + * btree_allocated_bitmap can represent sector addresses of a u64: it itself has + * 64 elements, so 64 - ilog2(64) + */ +#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58 + +/* * This limit comes from the bucket_gens array - it's a single allocation, and * kernel allocation are limited to INT_MAX */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ce7410d72089..7c71594f6a8b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return -BCH_ERR_invalid_sb_layout_nr_superblocks; } + if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) { + prt_printf(out, "Invalid superblock layout: max_size_bits too high"); + return -BCH_ERR_invalid_sb_layout_sb_max_size_bits; + } + max_sectors = 1 << layout->sb_max_size_bits; prev_offset = le64_to_cpu(layout->sb_offset[0]); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 657fd3759e7b..a6ed9a0bf1c7 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -272,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) clean_passes++; if (bch2_btree_interior_updates_flush(c) || + bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 315038a0a92d..fb5c1543e52f 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, unsigned i; u64 time; + if (nr == 0 || nr_threads == 0) { + pr_err("nr of iterations or threads is not allowed to be 0"); + return -EINVAL; + } + atomic_set(&j.ready, nr_threads); init_waitqueue_head(&j.ready_wait); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index f92f108840f5..8f430ff8e445 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -11,12 +11,13 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/nls.h> #include <linux/buffer_head.h> #include <linux/vfs.h> -#include <linux/parser.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/cred.h> @@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static void befs_put_super(struct super_block *); -static int befs_remount(struct super_block *, int *, char *); static int befs_statfs(struct dentry *, struct kstatfs *); static int befs_show_options(struct seq_file *, struct dentry *); -static int parse_options(char *, struct befs_mount_options *); static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_get_parent(struct dentry *child); +static void befs_free_fc(struct fs_context *fc); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ .free_inode = befs_free_inode, /* deallocate an inode */ .put_super = befs_put_super, /* uninit super */ .statfs = befs_statfs, /* statfs */ - .remount_fs = befs_remount, .show_options = befs_show_options, }; @@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child) } enum { - Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, + Opt_uid, Opt_gid, Opt_charset, Opt_debug, }; -static const match_table_t befs_tokens = { - {Opt_uid, "uid=%d"}, - {Opt_gid, "gid=%d"}, - {Opt_charset, "iocharset=%s"}, - {Opt_debug, "debug"}, - {Opt_err, NULL} +static const struct fs_parameter_spec befs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_string ("iocharset", Opt_charset), + fsparam_flag ("debug", Opt_debug), + {} }; static int -parse_options(char *options, struct befs_mount_options *opts) +befs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - kuid_t uid; - kgid_t gid; - - /* Initialize options */ - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->use_uid = 0; - opts->use_gid = 0; - opts->iocharset = NULL; - opts->debug = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, befs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - uid = INVALID_UID; - if (option >= 0) - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - pr_err("Invalid uid %d, " - "using default\n", option); - break; - } - opts->uid = uid; - opts->use_uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - gid = INVALID_GID; - if (option >= 0) - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - pr_err("Invalid gid %d, " - "using default\n", option); - break; - } - opts->gid = gid; - opts->use_gid = 1; - break; - case Opt_charset: - kfree(opts->iocharset); - opts->iocharset = match_strdup(&args[0]); - if (!opts->iocharset) { - pr_err("allocation failure for " - "iocharset string\n"); - return 0; - } - break; - case Opt_debug: - opts->debug = 1; - break; - default: - pr_err("Unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct befs_mount_options *opts = fc->fs_private; + int token; + struct fs_parse_result result; + + /* befs ignores all options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; + + token = fs_parse(fc, befs_param_spec, param, &result); + if (token < 0) + return token; + + switch (token) { + case Opt_uid: + opts->uid = result.uid; + opts->use_uid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->use_gid = 1; + break; + case Opt_charset: + kfree(opts->iocharset); + opts->iocharset = param->string; + param->string = NULL; + break; + case Opt_debug: + opts->debug = 1; + break; + default: + return -EINVAL; } - return 1; + return 0; } static int befs_show_options(struct seq_file *m, struct dentry *root) @@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +/* + * Copy the parsed options into the sbi mount_options member + */ +static void +befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts) +{ + sbi->mount_opts.uid = opts->uid; + sbi->mount_opts.gid = opts->gid; + sbi->mount_opts.use_uid = opts->use_uid; + sbi->mount_opts.use_gid = opts->use_gid; + sbi->mount_opts.debug = opts->debug; + sbi->mount_opts.iocharset = opts->iocharset; + opts->iocharset = NULL; +} + /* Allocate private field of the superblock, fill it. * * Finish filling the public superblock fields @@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb) * Load a set of NLS translations if needed. */ static int -befs_fill_super(struct super_block *sb, void *data, int silent) +befs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh; struct befs_sb_info *befs_sb; @@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) const unsigned long sb_block = 0; const off_t x86_sb_off = 512; int blocksize; + struct befs_mount_options *parsed_opts = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); if (sb->s_fs_info == NULL) @@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) befs_sb = BEFS_SB(sb); - if (!parse_options((char *) data, &befs_sb->mount_opts)) { - if (!silent) - befs_error(sb, "cannot parse mount options"); - goto unacquire_priv_sbp; - } + befs_set_options(befs_sb, parsed_opts); befs_debug(sb, "---> %s", __func__); @@ -934,10 +907,10 @@ unacquire_none: } static int -befs_remount(struct super_block *sb, int *flags, char *data) +befs_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - if (!(*flags & SB_RDONLY)) + sync_filesystem(fc->root->d_sb); + if (!(fc->sb_flags & SB_RDONLY)) return -EINVAL; return 0; } @@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static struct dentry * -befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) +static int befs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, befs_fill_super); +} + +static const struct fs_context_operations befs_context_ops = { + .parse_param = befs_parse_param, + .get_tree = befs_get_tree, + .reconfigure = befs_reconfigure, + .free = befs_free_fc, +}; + +static int befs_init_fs_context(struct fs_context *fc) +{ + struct befs_mount_options *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + /* Initialize options */ + opts->uid = GLOBAL_ROOT_UID; + opts->gid = GLOBAL_ROOT_GID; + + fc->fs_private = opts; + fc->ops = &befs_context_ops; + + return 0; +} + +static void befs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); + struct befs_mount_options *opts = fc->fs_private; + + kfree(opts->iocharset); + kfree(fc->fs_private); } static struct file_system_type befs_fs_type = { .owner = THIS_MODULE, .name = "befs", - .mount = befs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = befs_init_fs_context, + .parameters = befs_param_spec, }; MODULE_ALIAS_FS("befs"); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fec5c6cde0a7..7e0f9600b80c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); + WRITE_ONCE(bbio->status, BLK_STS_OK); } /* @@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -static void btrfs_orig_write_end_io(struct bio *bio); - -static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, - struct btrfs_bio *orig_bbio) -{ - /* - * For writes we tolerate nr_mirrors - 1 write failures, so we can't - * just blindly propagate a write failure here. Instead increment the - * error count in the original I/O context so that it is guaranteed to - * be larger than the error tolerance. - */ - if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) { - struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private; - struct btrfs_io_context *orig_bioc = orig_stripe->bioc; - - atomic_add(orig_bioc->max_errors, &orig_bioc->error); - } else { - orig_bbio->bio.bi_status = bbio->bio.bi_status; - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - if (bbio->bio.bi_status) - btrfs_bbio_propagate_error(bbio, orig_bbio); btrfs_cleanup_bio(bbio); bbio = orig_bbio; } - if (atomic_dec_and_test(&bbio->pending_ios)) + /* + * At this point, bbio always points to the original btrfs_bio. Save + * the first error in it. + */ + if (status != BLK_STS_OK) + cmpxchg(&bbio->status, BLK_STS_OK, status); + + if (atomic_dec_and_test(&bbio->pending_ios)) { + /* Load split bio's error which might be set above. */ + if (status == BLK_STS_OK) + bbio->bio.bi_status = READ_ONCE(bbio->status); __btrfs_bio_end_io(bbio); + } } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e48612340745..e2fe16074ad6 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -79,6 +79,9 @@ struct btrfs_bio { /* File system that this I/O operates on. */ struct btrfs_fs_info *fs_info; + /* Save the first error status of split bio. */ + blk_status_t status; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 317a3712270f..307dedf95c70 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -744,16 +744,11 @@ const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); /* - * We use page status Private2 to indicate there is an ordered extent with + * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. - * - * Rename the Private2 accessors to Ordered, to improve readability. */ -#define PageOrdered(page) PagePrivate2(page) -#define SetPageOrdered(page) SetPagePrivate2(page) -#define ClearPageOrdered(page) ClearPagePrivate2(page) -#define folio_test_ordered(folio) folio_test_private_2(folio) -#define folio_set_ordered(folio) folio_set_private_2(folio) -#define folio_clear_ordered(folio) folio_clear_private_2(folio) +#define folio_test_ordered(folio) folio_test_owner_2(folio) +#define folio_set_ordered(folio) folio_set_owner_2(folio) +#define folio_clear_ordered(folio) folio_clear_owner_2(folio) #endif diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b95ef44c326b..968dae953948 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * We can get a merged extent, in that case, we need to re-search * tree to get the original em for defrag. * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. + * This is because even if we have adjacent extents that are contiguous + * and compatible (same type and flags), we still want to defrag them + * so that we use less metadata (extent items in the extent tree and + * file extent items in the inode's subvolume tree). */ - if (em && (em->flags & EXTENT_FLAG_MERGED) && - newer_than && em->generation >= newer_than) { + if (em && (em->flags & EXTENT_FLAG_MERGED)) { free_extent_map(em); em = NULL; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 115b90d29b1d..cab94d141f66 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -298,7 +298,7 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) - return -1; + return 1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } @@ -649,7 +649,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); - list_del(&exist->add_list); + list_del_init(&exist->add_list); } else { ASSERT(0); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 872cca54cc6c..42c9899d9241 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -1708,7 +1708,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1722,8 +1722,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 668c617444a5..1d93e1202c33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -230,7 +230,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma if (extent_map_end(prev) != next->start) return false; - if (prev->flags != next->flags) + /* + * The merged flag is not an on-disk flag, it just indicates we had the + * extent maps of 2 (or more) adjacent extents merged, so factor it out. + */ + if ((prev->flags & ~EXTENT_FLAG_MERGED) != + (next->flags & ~EXTENT_FLAG_MERGED)) return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4fb521d91b06..e5384ceb8acf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1120,26 +1120,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { struct file *file = iocb->ki_filp; @@ -1170,7 +1150,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da51edbad6a0..e41bc7842577 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1513,7 +1513,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * (which the caller expects to stay locked), don't clear any * dirty bits and don't set any writeback bits * - * Do set the Ordered (Private2) bit so we know this page was + * Do set the Ordered flag so we know this page was * properly setup for writepage. */ page_ops = (keep_locked ? 0 : PAGE_UNLOCK); @@ -1618,7 +1618,7 @@ out_unlock: clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_folio, &cached, clear_bits, page_ops); - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); } return ret; } @@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; @@ -7294,7 +7294,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered - * (Private2) already cleared, so it's possible for endio and + * already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * @@ -7360,7 +7360,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, range_len = range_end + 1 - cur; if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* - * If Ordered (Private2) is cleared, it means endio has + * If Ordered is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. @@ -7433,7 +7433,7 @@ next: } /* * We have iterated through all ordered extents of the page, the page - * should not have Ordered (Private2) anymore, or the above iteration + * should not have Ordered anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2104d60c2161..95c8499a159a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* - * Ordered (Private2) bit indicates whether we still have + * Ordered flag indicates whether we still have * pending io unfinished for the ordered extent. * - * If there's no such bit, we need to skip to next range. + * If it's not set, we need to skip to next range. */ if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 926d7a9ed99d..881d62d81b9f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1979,25 +1979,10 @@ error: * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem * in fc->sb_flags. * - * This disambiguation has rather positive consequences. Mounting a subvolume - * ro will not also turn the superblock ro. Only the mount for the subvolume - * will become ro. - * - * So, if the superblock creation request comes from the new mount API the - * caller must have explicitly done: - * - * fsconfig(FSCONFIG_SET_FLAG, "ro") - * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) - * - * IOW, at some point the caller must have explicitly turned the whole - * superblock ro and we shouldn't just undo it like we did for the old mount - * API. In any case, it lets us avoid the hack in the new mount API. - * - * Consequently, the remounting hack must only be used for requests originating - * from the old mount API and should be marked for full deprecation so it can be - * turned off in a couple of years. - * - * The new mount API has no reason to support this hack. + * But, currently the util-linux mount command already utilizes the new mount + * API and is still setting fsconfig(FSCONFIG_SET_FLAG, "ro") no matter if it's + * btrfs or not, setting the whole super block RO. To make per-subvolume mounting + * work with different options work we need to keep backward compatibility. */ static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) { @@ -2019,7 +2004,7 @@ static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) if (IS_ERR(mnt)) return mnt; - if (!fc->oldapi || !ro2rw) + if (!ro2rw) return mnt; /* We need to convert to rw, call reconfigure. */ @@ -2206,7 +2191,8 @@ static struct file_system_type btrfs_fs_type = { .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8f340ad1d938..eb51b609190f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->bdev) { fs_devices->open_devices--; device->bdev = NULL; + device->bdev_file = NULL; } clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); btrfs_destroy_dev_zone_info(device); diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..bb4a31b9559d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1649,6 +1649,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (length == folio_size(folio)) filemap_release_folio(folio, 0); out: + folio_clear_mappedtodisk(folio); return; } EXPORT_SYMBOL(block_invalidate_folio); @@ -2803,7 +2804,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2814,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 35ba2117a6f6..3e63cfe15874 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object, static void cachefiles_clean_up_object(struct cachefiles_object *object, struct cachefiles_cache *cache) { + struct file *file; + if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) { if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) { cachefiles_see_object(object, cachefiles_obj_see_clean_delete); @@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object, } cachefiles_unmark_inode_in_use(object, object->file); - if (object->file) { - fput(object->file); - object->file = NULL; - } + + spin_lock(&object->lock); + file = object->file; + object->file = NULL; + spin_unlock(&object->lock); + + if (file) + fput(file); } /* diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 2b3f9935dbb4..7cf59713f0f7 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, } if (!d_is_negative(dentry)) { - if (d_backing_inode(dentry) == file_inode(object->file)) { - success = true; - goto out_dput; - } - ret = cachefiles_unlink(volume->cache, object, fan, dentry, FSCACHE_OBJECT_IS_STALE); if (ret < 0) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 470c96658385..fe3de9ad57bf 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb, { struct cachefiles_object *object = kiocb->ki_filp->private_data; struct cachefiles_cache *cache = object->volume->cache; - struct file *file = object->file; - size_t len = iter->count; + struct file *file; + size_t len = iter->count, aligned_len = len; loff_t pos = kiocb->ki_pos; const struct cred *saved_cred; int ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); cachefiles_begin_secure(cache, &saved_cred); - ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true); + ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) - return ret; + goto out; trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len); ret = __cachefiles_write(object, file, pos, iter, NULL, NULL); - if (!ret) + if (!ret) { ret = len; + kiocb->ki_pos += ret; + } +out: + fput(file); return ret; } @@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos, int whence) { struct cachefiles_object *object = filp->private_data; - struct file *file = object->file; + struct file *file; + loff_t ret; - if (!file) + spin_lock(&object->lock); + file = object->file; + if (!file) { + spin_unlock(&object->lock); return -ENOBUFS; + } + get_file(file); + spin_unlock(&object->lock); - return vfs_llseek(file, pos, whence); + ret = vfs_llseek(file, pos, whence); + fput(file); + + return ret; } static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c2a9e2cc03de..4c82348fe1e6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1054,7 +1054,9 @@ get_more_pages: if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + + page = &folio->page; doutc(cl, "? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -1081,8 +1083,6 @@ get_more_pages: continue; } if (page_offset(page) >= ceph_wbc.i_size) { - struct folio *folio = page_folio(page); - doutc(cl, "folio at %lu beyond eof %llu\n", folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || @@ -1098,16 +1098,16 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page) || - PagePrivate2(page) /* [DEPRECATED] */) { + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", page); - unlock_page(page); + doutc(cl, "%p under writeback\n", folio); + folio_unlock(folio); continue; } - doutc(cl, "waiting on writeback %p\n", page); - wait_on_page_writeback(page); - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { diff --git a/fs/char_dev.c b/fs/char_dev.c index 57cc096c498a..c2ddb998f3c9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) /** * cdev_device_del() - inverse of cdev_device_add - * @dev: the device structure * @cdev: the cdev structure + * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; @@ -1262,35 +1262,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); + loff_t copy_pos = iter->pos; + u64 copy_len = iomap_length(iter); + u32 mod; int id = 0; s64 ret = 0; void *daddr = NULL, *saddr = NULL; - /* don't bother with blocks that are not shared to start with */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; + if (!iomap_want_unshare_iter(iter)) + return iomap_length(iter); + + /* + * Extend the file range to be aligned to fsblock/pagesize, because + * we need to copy entire blocks, not just the byte range specified. + * Invalidate the mapping because we're about to CoW. + */ + mod = offset_in_page(copy_pos); + if (mod) { + copy_len += mod; + copy_pos -= mod; + } + + mod = offset_in_page(copy_pos + copy_len); + if (mod) + copy_len += PAGE_SIZE - mod; + + invalidate_inode_pages2_range(iter->inode->i_mapping, + copy_pos >> PAGE_SHIFT, + (copy_pos + copy_len - 1) >> PAGE_SHIFT); id = dax_read_lock(); - ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); if (ret < 0) goto out_unlock; - /* zero the distance if srcmap is HOLE or UNWRITTEN */ - if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) { - memset(daddr, 0, length); - dax_flush(iomap->dax_dev, daddr, length); - ret = length; - goto out_unlock; - } - - ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, length) == 0) - ret = length; + if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) + ret = iomap_length(iter); else ret = -EIO; diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d0..0099077a2982 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -135,6 +135,7 @@ struct dentry_stat_t { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ @@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_dcache_sysctls(void) @@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the @@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name - * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false @@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { diff --git a/fs/efs/super.c b/fs/efs/super.c index e4421c10caeb..c59086b7eabf 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -15,7 +15,6 @@ #include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/fs_context.h> -#include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> @@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; -enum { - Opt_explicit_open, -}; - -static const struct fs_parameter_spec efs_param_spec[] = { - fsparam_flag ("explicit-open", Opt_explicit_open), - {} -}; - /* * File system definition and registration. */ @@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = { .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = efs_init_fs_context, - .parameters = efs_param_spec, }; MODULE_ALIAS_FS("efs"); @@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); - return -EINVAL; + return invalf(fc, "device does not support %d byte blocks\n", + EFS_BLOCKSIZE); } /* read the vh (volume header) block */ @@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc) return 0; } -static void efs_free_fc(struct fs_context *fc) -{ - kfree(fc->fs_private); -} - static int efs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, efs_fill_super); } -static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - int token; - struct fs_parse_result result; - - token = fs_parse(fc, efs_param_spec, param, &result); - if (token < 0) - return token; - return 0; -} - static int efs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_RDONLY; return 0; } -struct efs_context { - unsigned long s_mount_opts; -}; - static const struct fs_context_operations efs_context_opts = { - .parse_param = efs_parse_param, .get_tree = efs_get_tree, .reconfigure = efs_reconfigure, - .free = efs_free_fc, }; /* @@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = { */ static int efs_init_fs_context(struct fs_context *fc) { - struct efs_context *ctx; - - ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - fc->fs_private = ctx; fc->ops = &efs_context_opts; return 0; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 320d586c3896..bed3dbe5b7cb 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -709,7 +709,9 @@ static int erofs_fc_get_tree(struct fs_context *fc) if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); - ret = get_tree_bdev(fc, erofs_fc_fill_super); + ret = get_tree_bdev_flags(fc, erofs_fc_fill_super, + IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ? + GET_TREE_BDEV_QUIET_LOOKUP : 0); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE if (ret == -ENOTBLK) { if (!fc->source) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..826d21c4486b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1002,7 +1003,7 @@ static struct file *epi_fget(const struct epitem *epi) struct file *file; file = epi->ffd.file; - if (!atomic_long_inc_not_zero(&file->f_count)) + if (!file_ref_get(&file->f_ref)) file = NULL; return file; } @@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1603,7 +1607,8 @@ allocate: spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..494d443e9fc9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1729,6 +1729,10 @@ struct ext4_sb_info { */ struct work_struct s_sb_upd_work; + /* Atomic write unit values in bytes */ + unsigned int s_awu_min; + unsigned int s_awu_max; + /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -3855,6 +3859,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +static inline bool ext4_inode_can_atomic_write(struct inode *inode) +{ + + return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; +} + extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..a7de03e47db0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -599,6 +599,13 @@ out: ssize_t err; loff_t endbyte; + /* + * There is no support for atomic writes on buffered-io yet, + * we should never fallback to buffered-io for DIO atomic + * writes. + */ + WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); + offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) @@ -692,6 +699,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + + if (iocb->ki_flags & IOCB_ATOMIC) { + size_t len = iov_iter_count(from); + int ret; + + if (len < EXT4_SB(inode->i_sb)->s_awu_min || + len > EXT4_SB(inode->i_sb)->s_awu_max) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else @@ -884,6 +905,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } + if (ext4_inode_can_atomic_write(inode)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..5b9eeb74ce47 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3444,17 +3444,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } +static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) +{ + /* must be a directio to fall back to buffered */ + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != + (IOMAP_WRITE | IOMAP_DIRECT)) + return false; + + /* atomic writes are all-or-nothing */ + if (flags & IOMAP_ATOMIC) + return false; + + /* can only try again if we wrote nothing */ + return written == 0; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code so that we - * fallback to buffered I/O and attempt to complete the remainder of - * the I/O. Any blocks that may have been allocated in preparation for - * the direct I/O will be reused during buffered I/O. + * the allocated blocks. If so, return the magic error code for + * non-atomic write so that we fallback to buffered I/O and attempt to + * complete the remainder of the I/O. + * For non-atomic writes, any blocks that may have been + * allocated in preparation for the direct I/O will be reused during + * buffered I/O. For atomic write, we never fallback to buffered-io. */ - if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) + if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; @@ -5578,6 +5595,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, } } + if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int awu_min = 0, awu_max = 0; + + if (ext4_inode_can_atomic_write(inode)) { + awu_min = sbi->s_awu_min; + awu_max = sbi->s_awu_max; + } + + generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + } + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 790db7eac6c2..612ccbeb493b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2395,11 +2395,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; -#if IS_ENABLED(CONFIG_UNICODE) - if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - utf8_validate(sb->s_encoding, &dentry->d_name)) + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; -#endif retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); if (retval) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..b7b9261fec3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -421,7 +421,7 @@ submit_and_retry: io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..7ea7178750f2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4425,6 +4425,36 @@ static int ext4_handle_clustersize(struct super_block *sb) return 0; } +/* + * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * @sb: super block + * TODO: Later add support for bigalloc + */ +static void ext4_atomic_write_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *bdev = sb->s_bdev; + + if (!bdev_can_atomic_write(bdev)) + return; + + if (!ext4_has_feature_extents(sb)) + return; + + sbi->s_awu_min = max(sb->s_blocksize, + bdev_atomic_write_unit_min_bytes(bdev)); + sbi->s_awu_max = min(sb->s_blocksize, + bdev_atomic_write_unit_max_bytes(bdev)); + if (sbi->s_awu_min && sbi->s_awu_max && + sbi->s_awu_min <= sbi->s_awu_max) { + ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", + sbi->s_awu_min, sbi->s_awu_max); + } else { + sbi->s_awu_min = 0; + sbi->s_awu_max = 0; + } +} + static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -5336,6 +5366,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -7329,7 +7360,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..e3ce763cce18 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..a7947a615db6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/slab.h> @@ -397,6 +396,9 @@ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return -EBADF; + /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. diff --git a/fs/file.c b/fs/file.c index eb093e736972..fb1011cf6b4a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -20,10 +20,73 @@ #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> +#include <linux/file_ref.h> #include <net/sock.h> #include "internal.h" +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) +{ + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} +EXPORT_SYMBOL_GPL(__file_ref_put); + unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ @@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". - * - * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied - * by that "1024/sizeof(ptr)" before, we already know there are sufficient - * clear low bits. Clang seems to realize that, gcc ends up being confused. - * - * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, - * let's consider it documentation (and maybe a test-case for gcc to improve - * its code generation ;) */ -static struct fdtable * alloc_fdtable(unsigned int nr) +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. Since we called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); - nr = ALIGN(nr, BITS_PER_LONG); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) @@ -152,14 +218,14 @@ out_arr: out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) @@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); + new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. @@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) synchronize_rcu(); spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... - */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); @@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); - return 1; + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) @@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr) __acquires(files->file_lock) { struct fdtable *fdt; - int expanded = 0; + int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return expanded; + return 0; /* Can we expand? */ if (nr >= sysctl_nr_open) @@ -227,7 +284,6 @@ repeat: if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); - expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; @@ -235,27 +291,28 @@ repeat: /* All good, so we try */ files->resize_in_progress = true; - expanded = expand_fdtable(files, nr); + error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); - return expanded; -} - -static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) -{ - __set_bit(fd, fdt->close_on_exec); + return error; } -static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - if (test_bit(fd, fdt->close_on_exec)) - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } -static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); @@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); - __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) @@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - int error; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) @@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - error = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - error = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); - return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) @@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); @@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) @@ -496,7 +552,7 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) + if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* @@ -504,36 +560,22 @@ repeat: * will limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif out: spin_unlock(&files->file_lock); @@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); + WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, } /** - * __close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close @@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. */ -int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f) if (!file) return NULL; - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); @@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f) OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* - * atomic_long_inc_not_zero() above provided a full memory - * barrier when we acquired a reference. + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still @@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * We need to confirm it by incrementing the refcount * and then check the lookup again. * - * atomic_long_inc_not_zero() gives us a full memory - * barrier. We only really need an 'acquire' one to - * protect the loads below, but we don't have that. + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. */ - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) continue; /* @@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *lookup_fdget_rcu(unsigned int fd) -{ - return __fget_files_rcu(current->files, fd, 0); - -} -EXPORT_SYMBOL_GPL(lookup_fdget_rcu); - -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) -{ - /* Must be called with rcu_read_lock held */ - struct files_struct *files; - struct file *file = NULL; - - task_lock(task); - files = task->files; - if (files) - file = __fget_files_rcu(files, fd, 0); - task_unlock(task); - - return file; -} - -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int * task_lock(task); files = task->files; if (files) { + rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } + rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fdget_rcu); +EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1096,6 +1120,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu); * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. + * + * (As an exception to rule 2, you can call filp_close between fget_light and + * fput_light provided that you capture a real refcount with get_file before + * the call to filp_close, and ensure that this real refcount is fput *after* + * the fput_light call.) + * + * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { @@ -1176,13 +1207,8 @@ void __f_unlock_pos(struct file *f) void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } @@ -1223,11 +1249,7 @@ __releases(&files->file_lock) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..976736be47cb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,7 +9,6 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> @@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = { /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; +static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path user_path; + union { + struct path user_path; + freeptr_t bf_freeptr; + }; }; static inline struct backing_file *backing_file(struct file *f) @@ -68,7 +71,7 @@ static inline void file_free(struct file *f) put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); - kfree(backing_file(f)); + kmem_cache_free(bfilp_cachep, backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } @@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ + memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->f_ra, 0, sizeof(f->f_ra)); + + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + + f->f_op = NULL; + f->f_mapping = NULL; + f->private_data = NULL; + f->f_inode = NULL; + f->f_owner = NULL; +#ifdef CONFIG_EPOLL + f->f_ep = NULL; +#endif + + f->f_iocb_flags = 0; + f->f_pos = 0; + f->f_wb_err = 0; + f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ - atomic_long_set(&f->f_count, 1); + file_ref_init(&f->f_ref, 1); return 0; } @@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); @@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) struct backing_file *ff; int error; - ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { - kfree(ff); + kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } @@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { + if (file_ref_put(&file->f_ref)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { @@ -512,7 +531,7 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) + if (file_ref_put(&file->f_ref)) __fput(file); } @@ -529,6 +548,11 @@ void __init files_init(void) filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + + args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); + bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), + &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3cd99e2dc6ac 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc. + */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) @@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..16fa61ef56bf 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc, f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); + param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: @@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_fd); +int fs_param_is_file_or_string(struct p_log *log, + const struct fs_parameter_spec *p, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + switch (param->type) { + case fs_value_is_string: + return fs_param_is_string(log, p, param, result); + case fs_value_is_file: + result->uint_32 = param->dirfd; + if (result->uint_32 <= INT_MAX) + return 0; + break; + default: + break; + } + return fs_param_bad_value(log, param); +} +EXPORT_SYMBOL(fs_param_is_file_or_string); + int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d418d8b5367f..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f7dd64856c9b..1e73cf87ff88 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = { .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, + .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { @@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, + .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 269c3bc7fced..4701c4aafbf4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -34,7 +34,6 @@ #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> -#include <linux/fdtable.h> #include <linux/file.h> #include "gfs2.h" @@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) i->file = NULL; } - rcu_read_lock(); for(;; i->fd++) { - struct inode *inode; - - i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); + i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } - inode = file_inode(i->file); - if (inode->i_sb == i->sb) + if (file_inode(i->file)->i_sb == i->sb) break; - rcu_read_unlock(); fput(i->file); - rcu_read_lock(); } - rcu_read_unlock(); return i->file; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eeac99765f0d..3bee9b5dba5e 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -15,10 +15,11 @@ #include <linux/module.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/vfs.h> @@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfs_remount(struct super_block *sb, int *flags, char *data) +static int hfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - *flags |= SB_NODIRATIME; - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + fc->sb_flags |= SB_NODIRATIME; + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { + + if (!(fc->sb_flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = { .put_super = hfs_put_super, .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, - .remount_fs = hfs_remount, .show_options = hfs_show_options, }; @@ -188,181 +191,112 @@ enum { opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask, opt_part, opt_session, opt_type, opt_creator, opt_quiet, opt_codepage, opt_iocharset, - opt_err }; -static const match_table_t tokens = { - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_umask, "umask=%o" }, - { opt_file_umask, "file_umask=%o" }, - { opt_dir_umask, "dir_umask=%o" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_type, "type=%s" }, - { opt_creator, "creator=%s" }, - { opt_quiet, "quiet" }, - { opt_codepage, "codepage=%s" }, - { opt_iocharset, "iocharset=%s" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32oct ("file_umask", opt_file_umask), + fsparam_u32oct ("dir_umask", opt_dir_umask), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + fsparam_string ("type", opt_type), + fsparam_string ("creator", opt_creator), + fsparam_flag ("quiet", opt_quiet), + fsparam_string ("codepage", opt_codepage), + fsparam_string ("iocharset", opt_iocharset), + {} }; -static inline int match_fourchar(substring_t *arg, u32 *result) -{ - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - /* - * parse_options() + * hfs_parse_param() * - * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger - * This function is called by hfs_read_super() to parse the mount options. + * This function is called by the vfs to parse the mount options. */ -static int parse_options(char *options, struct hfs_sb_info *hsb) +static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - /* initialize the sb with defaults */ - hsb->s_uid = current_uid(); - hsb->s_gid = current_gid(); - hsb->s_file_umask = 0133; - hsb->s_dir_umask = 0022; - hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ - hsb->s_quiet = 0; - hsb->part = -1; - hsb->session = -1; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(hsb->s_uid)) { - pr_err("invalid uid %d\n", tmp); - return 0; - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(hsb->s_gid)) { - pr_err("invalid gid %d\n", tmp); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_file_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("file_umask requires a value\n"); - return 0; - } - hsb->s_file_umask = (umode_t)tmp; - break; - case opt_dir_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("dir_umask requires a value\n"); - return 0; - } - hsb->s_dir_umask = (umode_t)tmp; - break; - case opt_part: - if (match_int(&args[0], &hsb->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: - if (match_int(&args[0], &hsb->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &hsb->s_type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_creator: - if (match_fourchar(&args[0], &hsb->s_creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_quiet: - hsb->s_quiet = 1; - break; - case opt_codepage: - if (hsb->nls_disk) { - pr_err("unable to change codepage\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_disk = load_nls(p); - if (!hsb->nls_disk) { - pr_err("unable to load codepage \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - case opt_iocharset: - if (hsb->nls_io) { - pr_err("unable to change iocharset\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - hsb->nls_io = load_nls(p); - if (!hsb->nls_io) { - pr_err("unable to load iocharset \"%s\"\n", p); - kfree(p); - return 0; - } - kfree(p); - break; - default: - return 0; - } - } + struct hfs_sb_info *hsb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* hfs does not honor any fs-specific options on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; - if (hsb->nls_disk && !hsb->nls_io) { - hsb->nls_io = load_nls_default(); + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_uid: + hsb->s_uid = result.uid; + break; + case opt_gid: + hsb->s_gid = result.gid; + break; + case opt_umask: + hsb->s_file_umask = (umode_t)result.uint_32; + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_file_umask: + hsb->s_file_umask = (umode_t)result.uint_32; + break; + case opt_dir_umask: + hsb->s_dir_umask = (umode_t)result.uint_32; + break; + case opt_part: + hsb->part = result.uint_32; + break; + case opt_session: + hsb->session = result.uint_32; + break; + case opt_type: + if (strlen(param->string) != 4) { + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_type, param->string, 4); + break; + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&hsb->s_creator, param->string, 4); + break; + case opt_quiet: + hsb->s_quiet = 1; + break; + case opt_codepage: + if (hsb->nls_disk) { + pr_err("unable to change codepage\n"); + return -EINVAL; + } + hsb->nls_disk = load_nls(param->string); + if (!hsb->nls_disk) { + pr_err("unable to load codepage \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_iocharset: + if (hsb->nls_io) { + pr_err("unable to change iocharset\n"); + return -EINVAL; + } + hsb->nls_io = load_nls(param->string); if (!hsb->nls_io) { - pr_err("unable to load default iocharset\n"); - return 0; + pr_err("unable to load iocharset \"%s\"\n", + param->string); + return -EINVAL; } + break; + default: + return -EINVAL; } - hsb->s_dir_umask &= 0777; - hsb->s_file_umask &= 0577; - return 1; + return 0; } /* @@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) * hfs_btree_init() to get the necessary data about the extents and * catalog B-trees and, finally, reading the root inode into memory. */ -static int hfs_fill_super(struct super_block *sb, void *data, int silent) +static int hfs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct hfs_sb_info *sbi; + struct hfs_sb_info *sbi = HFS_SB(sb); struct hfs_find_data fd; hfs_cat_rec rec; struct inode *root_inode; + int silent = fc->sb_flags & SB_SILENT; int res; - sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; + /* load_nls_default does not fail */ + if (sbi->nls_disk && !sbi->nls_io) + sbi->nls_io = load_nls_default(); + sbi->s_dir_umask &= 0777; + sbi->s_file_umask &= 0577; - sbi->sb = sb; - sb->s_fs_info = sbi; spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb); - res = -EINVAL; - if (!parse_options((char *)data, sbi)) { - pr_err("unable to parse mount options\n"); - goto bail; - } - + sbi->sb = sb; sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; sb->s_flags |= SB_NODIRATIME; @@ -451,18 +381,56 @@ bail: return res; } -static struct dentry *hfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfs_fill_super); +} + +static void hfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfs_context_ops = { + .parse_param = hfs_parse_param, + .get_tree = hfs_get_tree, + .reconfigure = hfs_reconfigure, + .free = hfs_free_fc, +}; + +static int hfs_init_fs_context(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); + struct hfs_sb_info *hsb; + + hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + if (!hsb) + return -ENOMEM; + + fc->s_fs_info = hsb; + fc->ops = &hfs_context_ops; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* initialize options with defaults */ + hsb->s_uid = current_uid(); + hsb->s_gid = current_gid(); + hsb->s_file_umask = 0133; + hsb->s_dir_umask = 0022; + hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ + hsb->s_quiet = 0; + hsb->part = -1; + hsb->session = -1; + } + + return 0; } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", - .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfs_init_fs_context, }; MODULE_ALIAS_FS("hfs"); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 59ce81dca73f..2f089bff0095 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -21,6 +21,7 @@ #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <linux/fs_context.h> #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -156,6 +157,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -307,7 +309,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } @@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); /* options.c */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts); -int hfsplus_parse_options_remount(char *input, int *force); -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi); +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param); int hfsplus_show_options(struct seq_file *seq, struct dentry *root); /* part_tbl.c */ diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c94a58762ad6..a66a09a56bf7 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -12,7 +12,8 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> @@ -23,26 +24,23 @@ enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, - opt_nodecompose, opt_decompose, - opt_barrier, opt_nobarrier, - opt_force, opt_err + opt_decompose, opt_barrier, + opt_force, }; -static const match_table_t tokens = { - { opt_creator, "creator=%s" }, - { opt_type, "type=%s" }, - { opt_umask, "umask=%o" }, - { opt_uid, "uid=%u" }, - { opt_gid, "gid=%u" }, - { opt_part, "part=%u" }, - { opt_session, "session=%u" }, - { opt_nls, "nls=%s" }, - { opt_decompose, "decompose" }, - { opt_nodecompose, "nodecompose" }, - { opt_barrier, "barrier" }, - { opt_nobarrier, "nobarrier" }, - { opt_force, "force" }, - { opt_err, NULL } +static const struct fs_parameter_spec hfs_param_spec[] = { + fsparam_string ("creator", opt_creator), + fsparam_string ("type", opt_type), + fsparam_u32oct ("umask", opt_umask), + fsparam_u32 ("uid", opt_uid), + fsparam_u32 ("gid", opt_gid), + fsparam_u32 ("part", opt_part), + fsparam_u32 ("session", opt_session), + fsparam_string ("nls", opt_nls), + fsparam_flag_no ("decompose", opt_decompose), + fsparam_flag_no ("barrier", opt_barrier), + fsparam_flag ("force", opt_force), + {} }; /* Initialize an options object to reasonable defaults */ @@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->session = -1; } -/* convert a "four byte character" to a 32 bit int with error checks */ -static inline int match_fourchar(substring_t *arg, u32 *result) +/* Parse options from mount. Returns nonzero errno on failure */ +int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { - if (arg->to - arg->from != 4) - return -EINVAL; - memcpy(result, arg->from, 4); - return 0; -} - -int hfsplus_parse_options_remount(char *input, int *force) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!input) - return 1; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_force: - *force = 1; - break; - default: - break; + struct hfsplus_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + /* + * Only the force option is examined during remount, all others + * are ignored. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + strncmp(param->key, "force", 5)) + return 0; + + opt = fs_parse(fc, hfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case opt_creator: + if (strlen(param->string) != 4) { + pr_err("creator requires a 4 character value\n"); + return -EINVAL; } - } - - return 1; -} - -/* Parse options from mount. Returns 0 on failure */ -/* input is the options passed to mount() as a string */ -int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int tmp, token; - - if (!input) - goto done; - - while ((p = strsep(&input, ",")) != NULL) { - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case opt_creator: - if (match_fourchar(&args[0], &sbi->creator)) { - pr_err("creator requires a 4 character value\n"); - return 0; - } - break; - case opt_type: - if (match_fourchar(&args[0], &sbi->type)) { - pr_err("type requires a 4 character value\n"); - return 0; - } - break; - case opt_umask: - if (match_octal(&args[0], &tmp)) { - pr_err("umask requires a value\n"); - return 0; - } - sbi->umask = (umode_t)tmp; - break; - case opt_uid: - if (match_int(&args[0], &tmp)) { - pr_err("uid requires an argument\n"); - return 0; - } - sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp); - if (!uid_valid(sbi->uid)) { - pr_err("invalid uid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_UID, &sbi->flags); - } - break; - case opt_gid: - if (match_int(&args[0], &tmp)) { - pr_err("gid requires an argument\n"); - return 0; - } - sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp); - if (!gid_valid(sbi->gid)) { - pr_err("invalid gid specified\n"); - return 0; - } else { - set_bit(HFSPLUS_SB_GID, &sbi->flags); - } - break; - case opt_part: - if (match_int(&args[0], &sbi->part)) { - pr_err("part requires an argument\n"); - return 0; - } - break; - case opt_session: - if (match_int(&args[0], &sbi->session)) { - pr_err("session requires an argument\n"); - return 0; - } - break; - case opt_nls: - if (sbi->nls) { - pr_err("unable to change nls mapping\n"); - return 0; - } - p = match_strdup(&args[0]); - if (p) - sbi->nls = load_nls(p); - if (!sbi->nls) { - pr_err("unable to load nls mapping \"%s\"\n", - p); - kfree(p); - return 0; - } - kfree(p); - break; - case opt_decompose: - clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_nodecompose: + memcpy(&sbi->creator, param->string, 4); + break; + case opt_type: + if (strlen(param->string) != 4) { + pr_err("type requires a 4 character value\n"); + return -EINVAL; + } + memcpy(&sbi->type, param->string, 4); + break; + case opt_umask: + sbi->umask = (umode_t)result.uint_32; + break; + case opt_uid: + sbi->uid = result.uid; + set_bit(HFSPLUS_SB_UID, &sbi->flags); + break; + case opt_gid: + sbi->gid = result.gid; + set_bit(HFSPLUS_SB_GID, &sbi->flags); + break; + case opt_part: + sbi->part = result.uint_32; + break; + case opt_session: + sbi->session = result.uint_32; + break; + case opt_nls: + if (sbi->nls) { + pr_err("unable to change nls mapping\n"); + return -EINVAL; + } + sbi->nls = load_nls(param->string); + if (!sbi->nls) { + pr_err("unable to load nls mapping \"%s\"\n", + param->string); + return -EINVAL; + } + break; + case opt_decompose: + if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); - break; - case opt_barrier: - clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_nobarrier: + else + clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); + break; + case opt_barrier: + if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); - break; - case opt_force: - set_bit(HFSPLUS_SB_FORCE, &sbi->flags); - break; - default: - return 0; - } - } - -done: - if (!sbi->nls) { - /* try utf8 first, as this is the old default behaviour */ - sbi->nls = load_nls("utf8"); - if (!sbi->nls) - sbi->nls = load_nls_default(); - if (!sbi->nls) - return 0; + else + clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); + break; + case opt_force: + set_bit(HFSPLUS_SB_FORCE, &sbi->flags); + break; + default: + return -EINVAL; } - return 1; + return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 97920202790f..948b8aaee33e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -14,6 +14,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/fs.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/nls.h> @@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int hfsplus_remount(struct super_block *sb, int *flags, char *data) +static int hfsplus_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & SB_RDONLY)) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; - int force = 0; - - if (!hfsplus_parse_options_remount(data, &force)) - return -EINVAL; + if (!(fc->sb_flags & SB_RDONLY)) { + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); + struct hfsplus_vh *vhdr = sbi->s_vhdr; if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; - } else if (force) { + fc->sb_flags |= SB_RDONLY; + } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); sb->s_flags |= SB_RDONLY; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; } } return 0; @@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = { .put_super = hfsplus_put_super, .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, - .remount_fs = hfsplus_remount, .show_options = hfsplus_show_options, }; -static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) +static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc) { struct hfsplus_vh *vhdr; - struct hfsplus_sb_info *sbi; + struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); hfsplus_cat_entry entry; struct hfs_find_data fd; struct inode *root, *inode; struct qstr str; - struct nls_table *nls = NULL; + struct nls_table *nls; u64 last_fs_block, last_fs_page; + int silent = fc->sb_flags & SB_SILENT; int err; - err = -ENOMEM; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out; - - sb->s_fs_info = sbi; mutex_init(&sbi->alloc_mutex); mutex_init(&sbi->vh_mutex); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); - hfsplus_fill_defaults(sbi); err = -EINVAL; - if (!hfsplus_parse_options(data, sbi)) { - pr_err("unable to parse mount options\n"); - goto out_unload_nls; + if (!sbi->nls) { + /* try utf8 first, as this is the old default behaviour */ + sbi->nls = load_nls("utf8"); + if (!sbi->nls) + sbi->nls = load_nls_default(); } /* temporarily use utf8 to correctly find the hidden dir below */ @@ -616,7 +611,6 @@ out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); kfree(sbi); -out: return err; } @@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static struct dentry *hfsplus_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfsplus_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hfsplus_fill_super); +} + +static void hfsplus_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hfsplus_context_ops = { + .parse_param = hfsplus_parse_param, + .get_tree = hfsplus_get_tree, + .reconfigure = hfsplus_reconfigure, + .free = hfsplus_free_fc, +}; + +static int hfsplus_init_fs_context(struct fs_context *fc) +{ + struct hfsplus_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + hfsplus_fill_defaults(sbi); + + fc->s_fs_info = sbi; + fc->ops = &hfsplus_context_ops; + + return 0; } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", - .mount = hfsplus_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hfsplus_init_fs_context, }; MODULE_ALIAS_FS("hfsplus"); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 9592ffcb44e5..74801911bc1c 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index e73717daa5f9..27567920abe4 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -9,7 +9,8 @@ #include "hpfs_fn.h" #include <linux/module.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/init.h> #include <linux/statfs.h> #include <linux/magic.h> @@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) hpfs_sb(s)->sb_was_error = 1; } -/* +/* * A little trick to detect cycles in many hpfs structures and don't let the * kernel crash on corrupted filesystem. When first called, set c2 to 0. * @@ -272,146 +273,70 @@ static void destroy_inodecache(void) kmem_cache_destroy(hpfs_inode_cachep); } -/* - * A tiny parser for option strings, stolen from dosfs. - * Stolen again from read-only hpfs. - * And updated for table-driven option parsing. - */ - enum { - Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis, - Opt_check_none, Opt_check_normal, Opt_check_strict, - Opt_err_cont, Opt_err_ro, Opt_err_panic, - Opt_eas_no, Opt_eas_ro, Opt_eas_rw, - Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always, - Opt_timeshift, Opt_err, + Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case, + Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift, }; -static const match_table_t tokens = { - {Opt_help, "help"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%o"}, - {Opt_case_lower, "case=lower"}, - {Opt_case_asis, "case=asis"}, - {Opt_check_none, "check=none"}, - {Opt_check_normal, "check=normal"}, - {Opt_check_strict, "check=strict"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_err_panic, "errors=panic"}, - {Opt_eas_no, "eas=no"}, - {Opt_eas_ro, "eas=ro"}, - {Opt_eas_rw, "eas=rw"}, - {Opt_chkdsk_no, "chkdsk=no"}, - {Opt_chkdsk_errors, "chkdsk=errors"}, - {Opt_chkdsk_always, "chkdsk=always"}, - {Opt_timeshift, "timeshift=%d"}, - {Opt_err, NULL}, +static const struct constant_table hpfs_param_case[] = { + {"asis", 0}, + {"lower", 1}, + {} }; -static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask, - int *lowercase, int *eas, int *chk, int *errs, - int *chkdsk, int *timeshift) -{ - char *p; - int option; +static const struct constant_table hpfs_param_check[] = { + {"none", 0}, + {"normal", 1}, + {"strict", 2}, + {} +}; - if (!opts) - return 1; +static const struct constant_table hpfs_param_err[] = { + {"continue", 0}, + {"remount-ro", 1}, + {"panic", 2}, + {} +}; - /*pr_info("Parsing opts: '%s'\n",opts);*/ - - while ((p = strsep(&opts, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_help: - return 2; - case Opt_uid: - if (match_int(args, &option)) - return 0; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 0; - break; - case Opt_gid: - if (match_int(args, &option)) - return 0; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 0; - break; - case Opt_umask: - if (match_octal(args, &option)) - return 0; - *umask = option; - break; - case Opt_case_lower: - *lowercase = 1; - break; - case Opt_case_asis: - *lowercase = 0; - break; - case Opt_check_none: - *chk = 0; - break; - case Opt_check_normal: - *chk = 1; - break; - case Opt_check_strict: - *chk = 2; - break; - case Opt_err_cont: - *errs = 0; - break; - case Opt_err_ro: - *errs = 1; - break; - case Opt_err_panic: - *errs = 2; - break; - case Opt_eas_no: - *eas = 0; - break; - case Opt_eas_ro: - *eas = 1; - break; - case Opt_eas_rw: - *eas = 2; - break; - case Opt_chkdsk_no: - *chkdsk = 0; - break; - case Opt_chkdsk_errors: - *chkdsk = 1; - break; - case Opt_chkdsk_always: - *chkdsk = 2; - break; - case Opt_timeshift: - { - int m = 1; - char *rhs = args[0].from; - if (!rhs || !*rhs) - return 0; - if (*rhs == '-') m = -1; - if (*rhs == '+' || *rhs == '-') rhs++; - *timeshift = simple_strtoul(rhs, &rhs, 0) * m; - if (*rhs) - return 0; - break; - } - default: - return 0; - } - } - return 1; -} +static const struct constant_table hpfs_param_eas[] = { + {"no", 0}, + {"ro", 1}, + {"rw", 2}, + {} +}; + +static const struct constant_table hpfs_param_chkdsk[] = { + {"no", 0}, + {"errors", 1}, + {"always", 2}, + {} +}; + +static const struct fs_parameter_spec hpfs_param_spec[] = { + fsparam_flag ("help", Opt_help), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_enum ("case", Opt_case, hpfs_param_case), + fsparam_enum ("check", Opt_check, hpfs_param_check), + fsparam_enum ("errors", Opt_err, hpfs_param_err), + fsparam_enum ("eas", Opt_eas, hpfs_param_eas), + fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk), + fsparam_s32 ("timeshift", Opt_timeshift), + {} +}; + +struct hpfs_fc_context { + kuid_t uid; + kgid_t gid; + umode_t umask; + int lowercase; + int eas; + int chk; + int errs; + int chkdsk; + int timeshift; +}; static inline void hpfs_help(void) { @@ -439,49 +364,92 @@ HPFS filesystem options:\n\ \n"); } -static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) +static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; - int o; + struct hpfs_fc_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, hpfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_help: + hpfs_help(); + return -EINVAL; + case Opt_uid: + ctx->uid = result.uid; + break; + case Opt_gid: + ctx->gid = result.gid; + break; + case Opt_umask: + ctx->umask = result.uint_32; + break; + case Opt_case: + ctx->lowercase = result.uint_32; + break; + case Opt_check: + ctx->chk = result.uint_32; + break; + case Opt_err: + ctx->errs = result.uint_32; + break; + case Opt_eas: + ctx->eas = result.uint_32; + break; + case Opt_chkdsk: + ctx->chkdsk = result.uint_32; + break; + case Opt_timeshift: + { + int m = 1; + char *rhs = param->string; + int timeshift; + + if (*rhs == '-') m = -1; + if (*rhs == '+' || *rhs == '-') rhs++; + timeshift = simple_strtoul(rhs, &rhs, 0) * m; + if (*rhs) + return -EINVAL; + ctx->timeshift = timeshift; + break; + } + default: + return -EINVAL; + } + + return 0; +} + +static int hpfs_reconfigure(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx = fc->fs_private; + struct super_block *s = fc->root->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); sync_filesystem(s); - *flags |= SB_NOATIME; + fc->sb_flags |= SB_NOATIME; hpfs_lock(s); - uid = sbi->sb_uid; gid = sbi->sb_gid; - umask = 0777 & ~sbi->sb_mode; - lowercase = sbi->sb_lowercase; - eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; - errs = sbi->sb_err; timeshift = sbi->sb_timeshift; - - if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, ×hift))) { - pr_err("bad mount options.\n"); - goto out_err; - } - if (o == 2) { - hpfs_help(); - goto out_err; - } - if (timeshift != sbi->sb_timeshift) { + + if (ctx->timeshift != sbi->sb_timeshift) { pr_err("timeshift can't be changed using remount.\n"); goto out_err; } unmark_dirty(s); - sbi->sb_uid = uid; sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; sbi->sb_timeshift = timeshift; + sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift; - if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); + if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops = .evict_inode = hpfs_evict_inode, .put_super = hpfs_put_super, .statfs = hpfs_statfs, - .remount_fs = hpfs_remount_fs, .show_options = hpfs_show_options, }; -static int hpfs_fill_super(struct super_block *s, void *options, int silent) +static int hpfs_fill_super(struct super_block *s, struct fs_context *fc) { + struct hpfs_fc_context *ctx = fc->fs_private; struct buffer_head *bh0, *bh1, *bh2; struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; struct hpfs_sb_info *sbi; struct inode *root; - - kuid_t uid; - kgid_t gid; - umode_t umask; - int lowercase, eas, chk, errs, chkdsk, timeshift; + int silent = fc->sb_flags & SB_SILENT; dnode_secno root_dno; struct hpfs_dirent *de = NULL; struct quad_buffer_head qbh; - int o; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; @@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) mutex_init(&sbi->hpfs_mutex); hpfs_lock(s); - uid = current_uid(); - gid = current_gid(); - umask = current_umask(); - lowercase = 0; - eas = 2; - chk = 1; - errs = 1; - chkdsk = 1; - timeshift = 0; - - if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase, - &eas, &chk, &errs, &chkdsk, ×hift))) { - pr_err("bad mount options.\n"); - goto bail0; - } - if (o==2) { - hpfs_help(); - goto bail0; - } - /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); sbi->sb_fs_size = -1; @@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start); sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band); sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap); - sbi->sb_uid = uid; - sbi->sb_gid = gid; - sbi->sb_mode = 0777 & ~umask; + sbi->sb_uid = ctx->uid; + sbi->sb_gid = ctx->gid; + sbi->sb_mode = 0777 & ~ctx->umask; sbi->sb_n_free = -1; sbi->sb_n_free_dnodes = -1; - sbi->sb_lowercase = lowercase; - sbi->sb_eas = eas; - sbi->sb_chk = chk; - sbi->sb_chkdsk = chkdsk; - sbi->sb_err = errs; - sbi->sb_timeshift = timeshift; + sbi->sb_lowercase = ctx->lowercase; + sbi->sb_eas = ctx->eas; + sbi->sb_chk = ctx->chk; + sbi->sb_chkdsk = ctx->chkdsk; + sbi->sb_err = ctx->errs; + sbi->sb_timeshift = ctx->timeshift; sbi->sb_was_error = 0; sbi->sb_cp_table = NULL; sbi->sb_c_bitmap = -1; @@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) /* Check for general fs errors*/ if (spareblock->dirty && !spareblock->old_wrote) { - if (errs == 2) { + if (sbi->sb_err == 2) { pr_err("Improperly stopped, not mounted\n"); goto bail4; } @@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) } if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) { - if (errs >= 2) { + if (sbi->sb_err >= 2) { pr_err("Spare dnodes used, try chkdsk\n"); mark_dirty(s, 0); goto bail4; } hpfs_error(s, "warning: spare dnodes used, try chkdsk"); - if (errs == 0) + if (sbi->sb_err == 0) pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n"); } - if (chk) { + if (sbi->sb_chk) { unsigned a; if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) || le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) { @@ -755,18 +697,70 @@ bail0: return -EINVAL; } -static struct dentry *hpfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hpfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, hpfs_fill_super); +} + +static void hpfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations hpfs_fc_context_ops = { + .parse_param = hpfs_parse_param, + .get_tree = hpfs_get_tree, + .reconfigure = hpfs_reconfigure, + .free = hpfs_free_fc, +}; + +static int hpfs_init_fs_context(struct fs_context *fc) +{ + struct hpfs_fc_context *ctx; + + ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct hpfs_sb_info *sbi = hpfs_sb(sb); + + ctx->uid = sbi->sb_uid; + ctx->gid = sbi->sb_gid; + ctx->umask = 0777 & ~sbi->sb_mode; + ctx->lowercase = sbi->sb_lowercase; + ctx->eas = sbi->sb_eas; + ctx->chk = sbi->sb_chk; + ctx->chkdsk = sbi->sb_chkdsk; + ctx->errs = sbi->sb_err; + ctx->timeshift = sbi->sb_timeshift; + + } else { + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->umask = current_umask(); + ctx->lowercase = 0; + ctx->eas = 2; + ctx->chk = 1; + ctx->errs = 1; + ctx->chkdsk = 1; + ctx->timeshift = 0; + } + + fc->fs_private = ctx; + fc->ops = &hpfs_fc_context_ops; + + return 0; +}; + static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", - .mount = hpfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = hpfs_init_fs_context, + .parameters = hpfs_param_spec, }; MODULE_ALIAS_FS("hpfs"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..2dea122e5b93 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,6 +39,9 @@ #include <linux/uaccess.h> #include <linux/sched/mm.h> +#define CREATE_TRACE_POINTS +#include <trace/events/hugetlbfs.h> + static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } diff --git a/fs/inode.c b/fs/inode.c index 8dabb224f941..b13b778257ae 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,7 +21,12 @@ #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <trace/events/writeback.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timestamp.h> + #include "internal.h" /* @@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void) return nr_dirty > 0 ? nr_dirty : 0; } +#ifdef CONFIG_DEBUG_FS +static DEFINE_PER_CPU(long, mg_ctime_updates); +static DEFINE_PER_CPU(long, mg_fine_stamps); +static DEFINE_PER_CPU(long, mg_ctime_swaps); + +static unsigned long get_mg_ctime_updates(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_updates, i)); + return sum; +} + +static unsigned long get_mg_fine_stamps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_fine_stamps, i)); + return sum; +} + +static unsigned long get_mg_ctime_swaps(void) +{ + unsigned long sum = 0; + int i; + + for_each_possible_cpu(i) + sum += data_race(per_cpu(mg_ctime_swaps, i)); + return sum; +} + +#define mgtime_counter_inc(__var) this_cpu_inc(__var) + +static int mgts_show(struct seq_file *s, void *p) +{ + unsigned long ctime_updates = get_mg_ctime_updates(); + unsigned long ctime_swaps = get_mg_ctime_swaps(); + unsigned long fine_stamps = get_mg_fine_stamps(); + unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); + + seq_printf(s, "%lu %lu %lu %lu\n", + ctime_updates, ctime_swaps, fine_stamps, floor_swaps); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mgts); + +static int __init mg_debugfs_init(void) +{ + debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); + return 0; +} +late_initcall(mg_debugfs_init); + +#else /* ! CONFIG_DEBUG_FS */ + +#define mgtime_counter_inc(__var) do { } while (0) + +#endif /* CONFIG_DEBUG_FS */ + /* * Handle nr_inode sysctl */ @@ -174,6 +243,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; + if (sb->s_type->fs_flags & FS_MGTIME) + inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); @@ -748,7 +819,7 @@ static void evict(struct inode *inode) * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ - smp_mb(); + smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -1241,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a variant of iget5_locked() for callers that don't want to fail on memory - * allocation of inode. + * and if present return it with an increased reference count. This is a + * variant of iget5_locked() that doesn't allocate an inode. * - * If the inode is not in cache, insert the pre-allocated inode to cache and + * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1314,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a generalized version of iget_locked() for file systems where the inode + * and if present return it with an increased reference count. This is a + * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * - * If the inode is not in cache, allocate a new inode and return it locked, - * hashed, and with the I_NEW flag set. The file system gets to fill it in - * before unlocking it via unlock_new_inode(). + * If the inode is not present in the cache, allocate and insert a new inode + * and return it locked, hashed, and with the I_NEW flag set. The file system + * gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -2211,19 +2281,58 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); +/** + * current_time - Return FS time (possibly fine-grained) + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp, but don't update + * the floor. + * + * For a multigrain inode, this is effectively an estimate of the timestamp + * that a file would receive. An actual update must go through + * inode_set_ctime_current(). + */ +struct timespec64 current_time(struct inode *inode) +{ + struct timespec64 now; + u32 cns; + + ktime_get_coarse_real_ts64_mg(&now); + + if (!is_mgtime(inode)) + goto out; + + /* If nothing has queried it, then coarse time is fine */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + /* + * If there is no apparent change, then get a fine-grained + * timestamp. + */ + if (now.tv_nsec == (cns & ~I_CTIME_QUERIED)) + ktime_get_real_ts64(&now); + } +out: + return timestamp_truncate(now, inode); +} +EXPORT_SYMBOL(current_time); + static int inode_needs_update_time(struct inode *inode) { + struct timespec64 now, ts; int sync_it = 0; - struct timespec64 now = current_time(inode); - struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + now = current_time(inode); + ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it = S_MTIME; + sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) @@ -2600,6 +2709,16 @@ void inode_nohighmem(struct inode *inode) } EXPORT_SYMBOL(inode_nohighmem); +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) +{ + trace_inode_set_ctime_to_ts(inode, &ts); + set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; + return ts; +} +EXPORT_SYMBOL(inode_set_ctime_to_ts); + /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec @@ -2632,39 +2751,159 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) EXPORT_SYMBOL(timestamp_truncate); /** - * current_time - Return FS time - * @inode: inode. + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode * - * Return the current time truncated to the time granularity supported by - * the fs. + * Set the inode's ctime to the current value for the inode. Returns the + * current value that was assigned. If this is not a multigrain inode, then we + * set it to the later of the coarse time and floor value. + * + * If it is multigrain, then we first see if the coarse-grained timestamp is + * distinct from what is already there. If so, then use that. Otherwise, get a + * fine-grained timestamp. * - * Note that inode and inode->sb cannot be NULL. - * Otherwise, the function warns and returns time without truncation. + * After that, try to swap the new value into i_ctime_nsec. Accept the + * resulting ctime, regardless of the outcome of the swap. If it has + * already been replaced, then that timestamp is later than the earlier + * unacceptable one, and is thus acceptable. */ -struct timespec64 current_time(struct inode *inode) +struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; + u32 cns, cur; - ktime_get_coarse_real_ts64(&now); - return timestamp_truncate(now, inode); + ktime_get_coarse_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + + /* Just return that if this is not a multigrain fs */ + if (!is_mgtime(inode)) { + inode_set_ctime_to_ts(inode, now); + goto out; + } + + /* + * A fine-grained time is only needed if someone has queried + * for timestamps, and the current coarse grained time isn't + * later than what's already there. + */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, + .tv_nsec = cns & ~I_CTIME_QUERIED }; + + if (timespec64_compare(&now, &ctime) <= 0) { + ktime_get_real_ts64_mg(&now); + now = timestamp_truncate(now, inode); + mgtime_counter_inc(mg_fine_stamps); + } + } + mgtime_counter_inc(mg_ctime_updates); + + /* No need to cmpxchg if it's exactly the same */ + if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { + trace_ctime_xchg_skip(inode, &now); + goto out; + } + cur = cns; +retry: + /* Try to swap the nsec value into place. */ + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { + /* If swap occurred, then we're (mostly) done */ + inode->i_ctime_sec = now.tv_sec; + trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); + mgtime_counter_inc(mg_ctime_swaps); + } else { + /* + * Was the change due to someone marking the old ctime QUERIED? + * If so then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { + cns = cur; + goto retry; + } + /* Otherwise, keep the existing ctime */ + now.tv_sec = inode->i_ctime_sec; + now.tv_nsec = cur & ~I_CTIME_QUERIED; + } +out: + return now; } -EXPORT_SYMBOL(current_time); +EXPORT_SYMBOL(inode_set_ctime_current); /** - * inode_set_ctime_current - set the ctime to current_time - * @inode: inode + * inode_set_ctime_deleg - try to update the ctime on a delegated inode + * @inode: inode to update + * @update: timespec64 to set the ctime * - * Set the inode->i_ctime to the current value for the inode. Returns - * the current value that was assigned to i_ctime. + * Attempt to atomically update the ctime on behalf of a delegation holder. + * + * The nfs server can call back the holder of a delegation to get updated + * inode attributes, including the mtime. When updating the mtime, update + * the ctime to a value at least equal to that. + * + * This can race with concurrent updates to the inode, in which + * case the update is skipped. + * + * Note that this works even when multigrain timestamps are not enabled, + * so it is used in either case. */ -struct timespec64 inode_set_ctime_current(struct inode *inode) +struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { - struct timespec64 now = current_time(inode); + struct timespec64 now, cur_ts; + u32 cur, old; - inode_set_ctime_to_ts(inode, now); - return now; + /* pairs with try_cmpxchg below */ + cur = smp_load_acquire(&inode->i_ctime_nsec); + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + cur_ts.tv_sec = inode->i_ctime_sec; + + /* If the update is older than the existing value, skip it. */ + if (timespec64_compare(&update, &cur_ts) <= 0) + return cur_ts; + + ktime_get_coarse_real_ts64_mg(&now); + + /* Clamp the update to "now" if it's in the future */ + if (timespec64_compare(&update, &now) > 0) + update = now; + + update = timestamp_truncate(update, inode); + + /* No need to update if the values are already the same */ + if (timespec64_equal(&update, &cur_ts)) + return cur_ts; + + /* + * Try to swap the nsec value into place. If it fails, that means + * it raced with an update due to a write or similar activity. That + * stamp takes precedence, so just skip the update. + */ +retry: + old = cur; + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { + inode->i_ctime_sec = update.tv_sec; + mgtime_counter_inc(mg_ctime_swaps); + return update; + } + + /* + * Was the change due to another task marking the old ctime QUERIED? + * + * If so, then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) + goto retry; + + /* Otherwise, it was a new timestamp. */ + cur_ts.tv_sec = inode->i_ctime_sec; + cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + return cur_ts; } -EXPORT_SYMBOL(inode_set_ctime_current); +EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged @@ -2672,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_current); * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * - * Check wether @vfsgid is in the caller's group list or if the caller is + * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index aa587b2142e2..ce73d2a48c1e 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1277,22 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) loff_t length = iomap_length(iter); loff_t written = 0; - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - - /* - * Don't bother with delalloc reservations, holes or unwritten extents. - * - * Note that we use srcmap directly instead of iomap_iter_srcmap as - * unsharing requires providing a separate source map, and the presence - * of one is a good indicator that unsharing is needed, unlike - * IOMAP_F_SHARED which can be set for any data that goes into the COW - * fork for XFS. - */ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_DELALLOC || - iter->srcmap.type == IOMAP_UNWRITTEN) + if (!iomap_want_unshare_iter(iter)) return length; do { @@ -1799,7 +1784,7 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f637aa0706a3..b521eb15759e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) + const struct iomap *iomap, bool use_fua, bool atomic) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + if (atomic) + opflags |= REQ_ATOMIC; return opflags; } @@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); + bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, size_t copied = 0; size_t orig_count; + if (atomic && length != fs_block_size) + return -EINVAL; + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; @@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - /* - * Set the operation flags early so that bio_iov_iter_get_pages - * can set up the page vector appropriately for a ZONE_APPEND - * operation. - */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; + if (WARN_ON_ONCE(atomic && n != length)) { + /* + * This bio should have covered the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; + } if (dio->flags & IOMAP_DIO_WRITE) { task_io_account_write(n); } else { @@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. + */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 0a991c4ce87d..4118a42cdab0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 33ef13a0b110..8794281f8ffd 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -24,6 +24,7 @@ #define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ #define JFS_ERR_CONTINUE 0x00000004 /* continue */ #define JFS_ERR_PANIC 0x00000008 /* panic */ +#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC) /* Quota support */ #define JFS_USRQUOTA 0x00000010 diff --git a/fs/jfs/super.c b/fs/jfs/super.c index e1be21ca5d6e..223d9ac59839 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -6,11 +6,11 @@ #include <linux/fs.h> #include <linux/module.h> -#include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> -#include <linux/mount.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/posix_acl.h> @@ -210,240 +210,195 @@ enum { Opt_discard, Opt_nodiscard, Opt_discard_minblk }; -static const match_table_t tokens = { - {Opt_integrity, "integrity"}, - {Opt_nointegrity, "nointegrity"}, - {Opt_iocharset, "iocharset=%s"}, - {Opt_resize, "resize=%u"}, - {Opt_resize_nosize, "resize"}, - {Opt_errors, "errors=%s"}, - {Opt_ignore, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_grpquota, "grpquota"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%u"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_discard_minblk, "discard=%u"}, - {Opt_err, NULL} +static const struct constant_table jfs_param_errors[] = { + {"continue", JFS_ERR_CONTINUE}, + {"remount-ro", JFS_ERR_REMOUNT_RO}, + {"panic", JFS_ERR_PANIC}, + {} }; -static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, - int *flag) -{ - void *nls_map = (void *)-1; /* -1: no change; NULL: none */ - char *p; - struct jfs_sb_info *sbi = JFS_SBI(sb); +static const struct fs_parameter_spec jfs_param_spec[] = { + fsparam_flag_no ("integrity", Opt_integrity), + fsparam_string ("iocharset", Opt_iocharset), + fsparam_u64 ("resize", Opt_resize), + fsparam_flag ("resize", Opt_resize_nosize), + fsparam_enum ("errors", Opt_errors, jfs_param_errors), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_flag ("discard", Opt_discard), + fsparam_u32 ("discard", Opt_discard_minblk), + fsparam_flag ("nodiscard", Opt_nodiscard), + {} +}; - *newLVSize = 0; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_integrity: - *flag &= ~JFS_NOINTEGRITY; - break; - case Opt_nointegrity: - *flag |= JFS_NOINTEGRITY; - break; - case Opt_ignore: - /* Silently ignore the quota options */ - /* Don't do anything ;-) */ - break; - case Opt_iocharset: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); - if (!strcmp(args[0].from, "none")) - nls_map = NULL; - else { - nls_map = load_nls(args[0].from); - if (!nls_map) { - pr_err("JFS: charset not found\n"); - goto cleanup; - } - } - break; - case Opt_resize: - { - char *resize = args[0].from; - int rc = kstrtoll(resize, 0, newLVSize); +struct jfs_context { + int flag; + kuid_t uid; + kgid_t gid; + uint umask; + uint minblks_trim; + void *nls_map; + bool resize; + s64 newLVSize; +}; - if (rc) - goto cleanup; - break; - } - case Opt_resize_nosize: - { - *newLVSize = sb_bdev_nr_blocks(sb); - if (*newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); - break; +static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct jfs_context *ctx = fc->fs_private; + int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE); + struct fs_parse_result result; + struct nls_table *nls_map; + int opt; + + opt = fs_parse(fc, jfs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_integrity: + if (result.negated) + ctx->flag |= JFS_NOINTEGRITY; + else + ctx->flag &= ~JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (ctx->nls_map && ctx->nls_map != (void *) -1) { + unload_nls(ctx->nls_map); + ctx->nls_map = NULL; } - case Opt_errors: - { - char *errors = args[0].from; - if (!errors || !*errors) - goto cleanup; - if (!strcmp(errors, "continue")) { - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_CONTINUE; - } else if (!strcmp(errors, "remount-ro")) { - *flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_PANIC; - *flag |= JFS_ERR_REMOUNT_RO; - } else if (!strcmp(errors, "panic")) { - *flag &= ~JFS_ERR_CONTINUE; - *flag &= ~JFS_ERR_REMOUNT_RO; - *flag |= JFS_ERR_PANIC; - } else { - pr_err("JFS: %s is an invalid error handler\n", - errors); - goto cleanup; + if (!strcmp(param->string, "none")) + ctx->nls_map = NULL; + else { + nls_map = load_nls(param->string); + if (!nls_map) { + pr_err("JFS: charset not found\n"); + return -EINVAL; } - break; + ctx->nls_map = nls_map; } + break; + case Opt_resize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + ctx->newLVSize = result.uint_64; + break; + case Opt_resize_nosize: + if (!reconfigure) + return -EINVAL; + ctx->resize = true; + break; + case Opt_errors: + ctx->flag &= ~JFS_ERR_MASK; + ctx->flag |= result.uint_32; + break; #ifdef CONFIG_QUOTA - case Opt_quota: - case Opt_usrquota: - *flag |= JFS_USRQUOTA; - break; - case Opt_grpquota: - *flag |= JFS_GRPQUOTA; - break; + case Opt_quota: + case Opt_usrquota: + ctx->flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + ctx->flag |= JFS_GRPQUOTA; + break; #else - case Opt_usrquota: - case Opt_grpquota: - case Opt_quota: - pr_err("JFS: quota operations not supported\n"); - break; + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + pr_err("JFS: quota operations not supported\n"); + break; #endif - case Opt_uid: - { - char *uid = args[0].from; - uid_t val; - int rc = kstrtouint(uid, 0, &val); - - if (rc) - goto cleanup; - sbi->uid = make_kuid(current_user_ns(), val); - if (!uid_valid(sbi->uid)) - goto cleanup; - break; - } - - case Opt_gid: - { - char *gid = args[0].from; - gid_t val; - int rc = kstrtouint(gid, 0, &val); - - if (rc) - goto cleanup; - sbi->gid = make_kgid(current_user_ns(), val); - if (!gid_valid(sbi->gid)) - goto cleanup; - break; + case Opt_uid: + ctx->uid = result.uid; + break; + + case Opt_gid: + ctx->gid = result.gid; + break; + + case Opt_umask: + if (result.uint_32 & ~0777) { + pr_err("JFS: Invalid value of umask\n"); + return -EINVAL; } + ctx->umask = result.uint_32; + break; - case Opt_umask: - { - char *umask = args[0].from; - int rc = kstrtouint(umask, 8, &sbi->umask); + case Opt_discard: + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + ctx->minblks_trim = 64; + ctx->flag |= JFS_DISCARD; + break; - if (rc) - goto cleanup; - if (sbi->umask & ~0777) { - pr_err("JFS: Invalid value of umask\n"); - goto cleanup; - } - break; - } + case Opt_nodiscard: + ctx->flag &= ~JFS_DISCARD; + break; - case Opt_discard: - /* if set to 1, even copying files will cause - * trimming :O - * -> user has more control over the online trimming - */ - sbi->minblks_trim = 64; - if (bdev_max_discard_sectors(sb->s_bdev)) - *flag |= JFS_DISCARD; - else - pr_err("JFS: discard option not supported on device\n"); - break; - - case Opt_nodiscard: - *flag &= ~JFS_DISCARD; - break; - - case Opt_discard_minblk: - { - char *minblks_trim = args[0].from; - int rc; - if (bdev_max_discard_sectors(sb->s_bdev)) { - *flag |= JFS_DISCARD; - rc = kstrtouint(minblks_trim, 0, - &sbi->minblks_trim); - if (rc) - goto cleanup; - } else - pr_err("JFS: discard option not supported on device\n"); - break; - } + case Opt_discard_minblk: + ctx->minblks_trim = result.uint_32; + ctx->flag |= JFS_DISCARD; + break; - default: - printk("jfs: Unrecognized mount option \"%s\" or missing value\n", - p); - goto cleanup; - } - } - - if (nls_map != (void *) -1) { - /* Discard old (if remount) */ - unload_nls(sbi->nls_tab); - sbi->nls_tab = nls_map; + default: + return -EINVAL; } - return 1; -cleanup: - if (nls_map && nls_map != (void *) -1) - unload_nls(nls_map); return 0; } -static int jfs_remount(struct super_block *sb, int *flags, char *data) +static int jfs_reconfigure(struct fs_context *fc) { - s64 newLVSize = 0; + struct jfs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + int readonly = fc->sb_flags & SB_RDONLY; int rc = 0; - int flag = JFS_SBI(sb)->flag; + int flag = ctx->flag; int ret; sync_filesystem(sb); - if (!parse_options(data, sb, &newLVSize, &flag)) - return -EINVAL; - if (newLVSize) { + /* Transfer results of parsing to the sbi */ + JFS_SBI(sb)->flag = ctx->flag; + JFS_SBI(sb)->uid = ctx->uid; + JFS_SBI(sb)->gid = ctx->gid; + JFS_SBI(sb)->umask = ctx->umask; + JFS_SBI(sb)->minblks_trim = ctx->minblks_trim; + if (ctx->nls_map != (void *) -1) { + unload_nls(JFS_SBI(sb)->nls_tab); + JFS_SBI(sb)->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (ctx->resize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } - rc = jfs_extendfs(sb, newLVSize, 0); + + if (!ctx->newLVSize) { + ctx->newLVSize = sb_bdev_nr_blocks(sb); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); + } + + rc = jfs_extendfs(sb, ctx->newLVSize, 0); if (rc) return rc; } - if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + if (sb_rdonly(sb) && !readonly) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + if (!sb_rdonly(sb) && readonly) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) JFS_SBI(sb)->flag = flag; return rc; } - if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) { if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) @@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); return ret; } + } JFS_SBI(sb)->flag = flag; return 0; } -static int jfs_fill_super(struct super_block *sb, void *data, int silent) +static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct jfs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct jfs_sb_info *sbi; struct inode *inode; int rc; - s64 newLVSize = 0; - int flag, ret = -EINVAL; + int ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); @@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; - sbi->uid = INVALID_UID; - sbi->gid = INVALID_GID; - sbi->umask = -1; - - /* initialize the mount flag and determine the default error handler */ - flag = JFS_ERR_REMOUNT_RO; - if (!parse_options((char *) data, sb, &newLVSize, &flag)) - goto out_kfree; - sbi->flag = flag; + /* Transfer results of parsing to the sbi */ + sbi->flag = ctx->flag; + sbi->uid = ctx->uid; + sbi->gid = ctx->gid; + sbi->umask = ctx->umask; + if (ctx->nls_map != (void *) -1) { + unload_nls(sbi->nls_tab); + sbi->nls_tab = ctx->nls_map; + } + ctx->nls_map = NULL; + + if (sbi->flag & JFS_DISCARD) { + if (!bdev_max_discard_sectors(sb->s_bdev)) { + pr_err("JFS: discard option not supported on device\n"); + sbi->flag &= ~JFS_DISCARD; + } else { + sbi->minblks_trim = ctx->minblks_trim; + } + } #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - if (newLVSize) { + if (ctx->resize) { pr_err("resize option for remount only\n"); - goto out_kfree; + goto out_unload; } /* @@ -608,7 +575,6 @@ out_mount_failed: sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); -out_kfree: kfree(sbi); return ret; } @@ -664,10 +630,9 @@ out: return rc; } -static struct dentry *jfs_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jfs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); + return get_tree_bdev(fc, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) @@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = { .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, - .remount_fs = jfs_remount, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, @@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = { .get_parent = jfs_get_parent, }; +static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx) +{ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + + /* Copy over current option values and mount flags */ + ctx->uid = JFS_SBI(sb)->uid; + ctx->gid = JFS_SBI(sb)->gid; + ctx->umask = JFS_SBI(sb)->umask; + ctx->nls_map = (void *)-1; + ctx->minblks_trim = JFS_SBI(sb)->minblks_trim; + ctx->flag = JFS_SBI(sb)->flag; + + } else { + /* + * Initialize the mount flag and determine the default + * error handler + */ + ctx->flag = JFS_ERR_REMOUNT_RO; + ctx->uid = INVALID_UID; + ctx->gid = INVALID_GID; + ctx->umask = -1; + ctx->nls_map = (void *)-1; + } +} + +static void jfs_free_fc(struct fs_context *fc) +{ + struct jfs_context *ctx = fc->fs_private; + + if (ctx->nls_map != (void *) -1) + unload_nls(ctx->nls_map); + kfree(ctx); +} + +static const struct fs_context_operations jfs_context_ops = { + .parse_param = jfs_parse_param, + .get_tree = jfs_get_tree, + .reconfigure = jfs_reconfigure, + .free = jfs_free_fc, +}; + +static int jfs_init_fs_context(struct fs_context *fc) +{ + struct jfs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + jfs_init_options(fc, ctx); + + fc->fs_private = ctx; + fc->ops = &jfs_context_ops; + + return 0; +} + static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", - .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = jfs_init_fs_context, + .parameters = jfs_param_spec, }; MODULE_ALIAS_FS("jfs"); diff --git a/fs/libfs.c b/fs/libfs.c index 46966fd8bcf9..a168ece5cc61 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); + + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + return NULL; + d_add(dentry, NULL); return NULL; } @@ -1791,8 +1795,8 @@ bool is_empty_dir_inode(struct inode *inode) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; @@ -1835,6 +1839,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } +EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1843,7 +1848,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1858,6 +1863,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } +EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1f2149db10f2..2359347c9fbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct inode *inode = nlmsvc_file_inode(file); + struct inode *inode __maybe_unused = nlmsvc_file_inode(file); struct nlm_block *block = NULL; int error; int mode; @@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; } @@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, * requests on the underlaying ->lock() implementation but * only one nlm_block to being granted by lm_grant(). */ - if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) && !list_empty(&block->b_list)) { spin_unlock(&nlm_blocked_lock); ret = nlm_lck_blocked; diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..05a8d544fb35 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -326,6 +326,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +367,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. + * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { diff --git a/fs/namespace.c b/fs/namespace.c index d26f5e6d2ca3..206fc54feeba 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; @@ -5006,6 +5006,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5040,35 +5074,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start) +{ + char *buf_end, *opt_start, *opt_end; + int count = 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == INT_MAX)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + return count; +} + +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + if (!sb->s_op->show_options) + return 0; + + buf_start = seq->buf + start; + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_num = err; + return 0; +} + +static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + buf_start = seq->buf + start; + + err = security_sb_show_options(seq, sb); + if (!err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_sec_num = err; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; + case STATMOUNT_OPT_SEC_ARRAY: + sm->opt_sec_array = start; + ret = statmount_opt_sec_array(s, seq); + break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer. + */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) @@ -5203,6 +5336,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + + if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); + + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5224,7 +5369,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index af46a598f4d7..7ac34550c403 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -627,7 +627,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, if (unlikely(always_fill)) { if (pos - offset + len <= i_size) return false; /* Page entirely before EOF */ - zero_user_segment(&folio->page, 0, plen); + folio_zero_segment(folio, 0, plen); folio_mark_uptodate(folio); return true; } @@ -646,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, return false; zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, plen); + folio_zero_segments(folio, 0, offset, offset + len, plen); return true; } @@ -713,7 +713,7 @@ retry: if (folio_test_uptodate(folio)) goto have_folio; - /* If the page is beyond the EOF, we want to clear it - unless it's + /* If the folio is beyond the EOF, we want to clear it - unless it's * within the cache granule containing the EOF, in which case we need * to preload the granule. */ @@ -773,7 +773,7 @@ error: EXPORT_SYMBOL(netfs_write_begin); /* - * Preload the data into a page we're proposing to write into. + * Preload the data into a folio we're proposing to write into. */ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len) diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b3910dfcb56d..b4826360a411 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode, * netfs_perform_write - Copy data into the pagecache. * @iocb: The operation parameters * @iter: The source buffer - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * - * Copy data into pagecache pages attached to the inode specified by @iocb. + * Copy data into pagecache folios attached to the inode specified by @iocb. * The caller must hold appropriate inode locks. * - * Dirty pages are tagged with a netfs_folio struct if they're not up to date - * to indicate the range modified. Dirty pages may also be tagged with a + * Dirty folios are tagged with a netfs_folio struct if they're not up to date + * to indicate the range modified. Dirty folios may also be tagged with a * netfs-specific grouping such that data from an old group gets flushed before * a new one is started. */ @@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, * we try to read it. */ if (fpos >= ctx->zero_point) { - zero_user_segment(&folio->page, 0, offset); + folio_zero_segment(folio, 0, offset); copied = copy_folio_from_iter_atomic(folio, offset, part, iter); if (unlikely(copied == 0)) goto copy_failed; - zero_user_segment(&folio->page, offset + copied, flen); + folio_zero_segment(folio, offset + copied, flen); __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); trace_netfs_folio(folio, netfs_modify_and_clear); @@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write); * netfs_buffered_write_iter_locked - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write - * @netfs_group: Grouping for dirty pages (eg. ceph snaps). + * @netfs_group: Grouping for dirty folios (eg. ceph snaps). * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates @@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter); /* * Notification that a previously read-only page is about to become writable. - * Note that the caller indicates a single page of a multipage folio. + * The caller indicates the precise page that needs to be written to, but + * we only track group on a per-folio basis, so we block more often than + * we might otherwise. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group) { @@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr struct address_space *mapping = file->f_mapping; struct inode *inode = file_inode(file); struct netfs_inode *ictx = netfs_inode(inode); - vm_fault_t ret = VM_FAULT_RETRY; + vm_fault_t ret = VM_FAULT_NOPAGE; int err; _enter("%lx", folio->index); @@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr if (folio_lock_killable(folio) < 0) goto out; - if (folio->mapping != mapping) { - folio_unlock(folio); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (folio_wait_writeback_killable(folio)) { - ret = VM_FAULT_LOCKED; - goto out; - } + if (folio->mapping != mapping) + goto unlock; + if (folio_wait_writeback_killable(folio) < 0) + goto unlock; /* Can we see a streaming write here? */ if (WARN_ON(!folio_test_uptodate(folio))) { - ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED; - goto out; + ret = VM_FAULT_SIGBUS; + goto unlock; } group = netfs_folio_group(folio); @@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr out: sb_end_pagefault(inode->i_sb); return ret; +unlock: + folio_unlock(folio); + goto out; } EXPORT_SYMBOL(netfs_page_mkwrite); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cb75c07b5281..ced14ac78cc1 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -322,8 +322,7 @@ maybe_wait: } return; no_wait: - clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags); - wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING); + clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags); } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 114282398716..03ecc7765615 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -181,8 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); - clp->cl_uuid.net = NULL; - clp->cl_uuid.dom = NULL; + nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 542c7d97b235..596f35170137 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -205,12 +205,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfs_fscache_invalidate(inode, 0); flags &= ~NFS_INO_REVAL_FORCED; - nfsi->cache_validity |= flags; + flags |= nfsi->cache_validity; + if (inode->i_mapping->nrpages == 0) + flags &= ~NFS_INO_INVALID_DATA; - if (inode->i_mapping->nrpages == 0) { - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; - nfs_ooo_clear(nfsi); - } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */ + smp_store_release(&nfsi->cache_validity, flags); + + if (inode->i_mapping->nrpages == 0 || + nfsi->cache_validity & NFS_INO_INVALID_DATA) { nfs_ooo_clear(nfsi); } trace_nfs_set_cache_invalid(inode, 0); @@ -628,23 +631,35 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) +{ + enum file_time_flags time_flags = 0; + unsigned int cache_flags = 0; + + if (ia_valid & ATTR_MTIME) { + time_flags |= S_MTIME | S_CTIME; + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (ia_valid & ATTR_ATIME) { + time_flags |= S_ATIME; + cache_flags |= NFS_INO_INVALID_ATIME; + } + inode_update_timestamps(inode, time_flags); + NFS_I(inode)->cache_validity &= ~cache_flags; +} + void nfs_update_delegated_atime(struct inode *inode) { spin_lock(&inode->i_lock); - if (nfs_have_delegated_atime(inode)) { - inode_update_timestamps(inode, S_ATIME); - NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME; - } + if (nfs_have_delegated_atime(inode)) + nfs_update_timestamps(inode, ATTR_ATIME); spin_unlock(&inode->i_lock); } void nfs_update_delegated_mtime_locked(struct inode *inode) { - if (nfs_have_delegated_mtime(inode)) { - inode_update_timestamps(inode, S_CTIME | S_MTIME); - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_MTIME); - } + if (nfs_have_delegated_mtime(inode)) + nfs_update_timestamps(inode, ATTR_MTIME); } void nfs_update_delegated_mtime(struct inode *inode) @@ -682,15 +697,16 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, attr->ia_valid &= ~ATTR_SIZE; } - if (nfs_have_delegated_mtime(inode)) { - if (attr->ia_valid & ATTR_MTIME) { - nfs_update_delegated_mtime(inode); - attr->ia_valid &= ~ATTR_MTIME; - } - if (attr->ia_valid & ATTR_ATIME) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; - } + if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { + spin_lock(&inode->i_lock); + nfs_update_timestamps(inode, attr->ia_valid); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); + } else if (nfs_have_delegated_atime(inode) && + attr->ia_valid & ATTR_ATIME && + !(attr->ia_valid & ATTR_MTIME)) { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; } /* Optimization: if the end result is no change, don't RPC */ @@ -1408,6 +1424,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); if (ret) goto out; + smp_rmb(); /* pairs with smp_wmb() below */ + if (test_bit(NFS_INO_INVALIDATING, bitlock)) + continue; + /* pairs with nfs_set_cache_invalid()'s smp_store_release() */ + if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA)) + goto out; + /* Slow-path that double-checks with spinlock held */ spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); @@ -1633,6 +1656,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr) fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; + fattr->mdsthreshold = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index d0aa680ec816..8f0ce82a677e 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -205,7 +205,8 @@ void nfs_local_probe(struct nfs_client *clp) nfs_local_disable(clp); } - nfs_uuid_begin(&clp->cl_uuid); + if (!nfs_uuid_begin(&clp->cl_uuid)) + return; if (nfs_server_uuid_is_local(clp)) nfs_local_enable(clp); nfs_uuid_end(&clp->cl_uuid); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cd2fbde2e6d7..9d40319e063d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3452,6 +3452,10 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, adjust_flags |= NFS_INO_INVALID_MODE; if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; + if (sattr->ia_valid & ATTR_ATIME) + adjust_flags |= NFS_INO_INVALID_ATIME; + if (sattr->ia_valid & ATTR_MTIME) + adjust_flags |= NFS_INO_INVALID_MTIME; do { nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label), diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9723b6c53397..ae5c5e39afa0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -885,7 +885,15 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request, ctx->timeo, ctx->retrans); + if ((request.protocol == XPRT_TRANSPORT_UDP) == + !(ctx->flags & NFS_MOUNT_TCP)) + /* + * NFS protocol and mount protocol are both UDP or neither UDP + * so timeouts are compatible. Use NFS timeouts for MOUNT + */ + status = nfs_mount(&request, ctx->timeo, ctx->retrans); + else + status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 5c8ce5066c16..09404d142d1a 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -5,7 +5,7 @@ */ #include <linux/module.h> -#include <linux/rculist.h> +#include <linux/list.h> #include <linux/nfslocalio.h> #include <net/netns/generic.h> @@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock); */ static LIST_HEAD(nfs_uuids); -void nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +void nfs_uuid_init(nfs_uuid_t *nfs_uuid) { nfs_uuid->net = NULL; nfs_uuid->dom = NULL; - uuid_gen(&nfs_uuid->uuid); + INIT_LIST_HEAD(&nfs_uuid->list); +} +EXPORT_SYMBOL_GPL(nfs_uuid_init); +bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid) +{ spin_lock(&nfs_uuid_lock); - list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids); + /* Is this nfs_uuid already in use? */ + if (!list_empty(&nfs_uuid->list)) { + spin_unlock(&nfs_uuid_lock); + return false; + } + uuid_gen(&nfs_uuid->uuid); + list_add_tail(&nfs_uuid->list, &nfs_uuids); spin_unlock(&nfs_uuid_lock); + + return true; } EXPORT_SYMBOL_GPL(nfs_uuid_begin); @@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid) { if (nfs_uuid->net == NULL) { spin_lock(&nfs_uuid_lock); - list_del_init(&nfs_uuid->list); + if (nfs_uuid->net == NULL) + list_del_init(&nfs_uuid->list); spin_unlock(&nfs_uuid_lock); } } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b5a6bf4f459f..d32f2dfd148f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1841,14 +1841,12 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!async_copy) goto out_err; async_copy->cp_nn = nn; + INIT_LIST_HEAD(&async_copy->copies); + refcount_set(&async_copy->refcount, 1); /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > - (int)rqstp->rq_pool->sp_nrthreads) { - atomic_dec(&nn->pending_async_copies); + (int)rqstp->rq_pool->sp_nrthreads) goto out_err; - } - INIT_LIST_HEAD(&async_copy->copies); - refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 551d2958ec29..d80406f8b568 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8001,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -8014,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -8036,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - /* - * Most filesystems with their own ->lock operations will block - * the nfsd thread waiting to acquire the lock. That leads to - * deadlocks (we don't want every nfsd thread tied up waiting - * for file locks), so don't attempt blocking lock notifications - * on those filesystems: - */ - if (!exportfs_lock_op_is_async(sb->s_export_op)) - flags &= ~FL_SLEEP; + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 22325b590e17..d6d4f2a0e898 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -903,11 +903,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, goto out; } - if (may_flags & NFSD_MAY_64BIT_COOKIE) - file->f_mode |= FMODE_64BITHASH; - else - file->f_mode |= FMODE_32BITHASH; - *filp = file; out: return host_err; @@ -2174,13 +2169,15 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, loff_t offset = *offsetp; int may_flags = NFSD_MAY_READ; - if (fhp->fh_64bit_cookies) - may_flags |= NFSD_MAY_64BIT_COOKIE; - err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file); if (err) goto out; + if (fhp->fh_64bit_cookies) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + offset = vfs_llseek(file, offset, SEEK_SET); if (offset < 0) { err = nfserrno((int)offset); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 57b4af5ad646..501ad7be5174 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1c9ae36a03ab..ace22253fed0 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; } - if (!buffer_mapped(bh)) { - bh->b_bdev = inode->i_sb->s_bdev; + if (!buffer_mapped(bh)) set_buffer_mapped(bh); - } bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index ceb7dc0b5bad..2db6350b5ac2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, if (buffer_uptodate(bh)) goto failed_bh; - bh->b_bdev = sb->s_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 4905063790c5..9b108052d9f7 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -157,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_constraint(inode->i_mapping, + ~__GFP_FS)); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5436eb0424bd..9a849397c768 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); - touch_buffer(bh); wait_on_buffer(bh); return bh; } @@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, folio_put(folio); return NULL; } + bh->b_bdev = inode->i_sb->s_bdev; return bh; } @@ -99,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh) */ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) { - void *kaddr0, *kaddr1; + void *saddr, *daddr; unsigned long bits; - struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio; struct buffer_head *bh; - kaddr0 = kmap_local_page(spage); - kaddr1 = kmap_local_page(dpage); - memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); - kunmap_local(kaddr1); - kunmap_local(kaddr0); + saddr = kmap_local_folio(sfolio, bh_offset(sbh)); + daddr = kmap_local_folio(dfolio, bh_offset(dbh)); + memcpy(daddr, saddr, sbh->b_size); + kunmap_local(daddr); + kunmap_local(saddr); dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; dbh->b_blocknr = sbh->b_blocknr; @@ -122,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) unlock_buffer(bh); } if (bits & BIT(BH_Uptodate)) - SetPageUptodate(dpage); + folio_mark_uptodate(dfolio); else - ClearPageUptodate(dpage); + folio_clear_uptodate(dfolio); if (bits & BIT(BH_Mapped)) - SetPageMappedToDisk(dpage); + folio_set_mappedtodisk(dfolio); else - ClearPageMappedToDisk(dpage); + folio_clear_mappedtodisk(dfolio); } /** @@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio) folio_clear_uptodate(folio); folio_clear_mappedtodisk(folio); + folio_clear_checked(folio); head = folio_buffers(folio); if (head) { diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5dbef7f5c95..6004dfdfdf0f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -16,7 +16,6 @@ #include <linux/security.h> #include <linux/spinlock.h> #include <linux/slab.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> static int dir_notify_enable __read_mostly = 1; @@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) new_fsn_mark = NULL; } - rcu_read_lock(); - f = lookup_fdget_rcu(fd); - rcu_read_unlock(); + f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 224bccaab4cc..24c7c5df4998 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> -#include <linux/fdtable.h> #include <linux/fsnotify_backend.h> #include <linux/init.h> #include <linux/jiffies.h> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..61b83039771e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 96b684763b39..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 58887456e3c5..4fa6c840d20b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1787,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode, return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); + + if (byte_start > id_count || byte_start + byte_len > id_count) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, byte_start + byte_len, 0); if (ret) { @@ -2804,6 +2812,7 @@ const struct file_operations ocfs2_fops = { .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; WRAP_DIR_ITER(ocfs2_readdir) // FIXME! @@ -2820,6 +2829,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index c4a4016d3866..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -574,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3d404624bb96..c79b4291777f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd0a05365e79..73a6f6fd8a8e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } diff --git a/fs/open.c b/fs/open.c index 5da4df2f9b18..446ab8ef04af 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1576,23 +1576,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -/** - * sys_close_range() - Close all file descriptors in a given range. - * - * @fd: starting file descriptor to close - * @max_fd: last file descriptor to close - * @flags: reserved for future extensions - * - * This closes a range of file descriptors. All file descriptors - * from @fd up to and including @max_fd are closed. - * Currently, errors to close a given file descriptor are ignored. - */ -SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, - unsigned int, flags) -{ - return __close_range(fd, max_fd, flags); -} - /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2ed6ad641a20..ee2cbd044ce6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,7 +16,6 @@ #include <linux/sched/signal.h> #include <linux/cred.h> #include <linux/namei.h> -#include <linux/fdtable.h> #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index e42546c6c5df..1115c22deca0 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void) const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), - fsparam_string("lowerdir+", Opt_lowerdir_add), - fsparam_string("datadir+", Opt_datadir_add), - fsparam_string("upperdir", Opt_upperdir), - fsparam_string("workdir", Opt_workdir), + fsparam_file_or_string("lowerdir+", Opt_lowerdir_add), + fsparam_file_or_string("datadir+", Opt_datadir_add), + fsparam_file_or_string("upperdir", Opt_upperdir), + fsparam_file_or_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), @@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) +static inline bool is_upper_layer(enum ovl_opt layer) +{ + return layer == Opt_upperdir || layer == Opt_workdir; +} + +/* Handle non-file descriptor-based layer options that require path lookup. */ +static inline int ovl_kern_path(const char *layer_name, struct path *layer_path, + enum ovl_opt layer) { - char *name = kstrdup(layer_name, GFP_KERNEL); - bool upper = (layer == Opt_upperdir || layer == Opt_workdir); - struct path path; int err; + switch (layer) { + case Opt_upperdir: + fallthrough; + case Opt_workdir: + fallthrough; + case Opt_lowerdir: + err = ovl_mount_dir(layer_name, layer_path); + break; + case Opt_lowerdir_add: + fallthrough; + case Opt_datadir_add: + err = ovl_mount_dir_noesc(layer_name, layer_path); + break; + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } + + return err; +} + +static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name, + struct path *layer_path, enum ovl_opt layer) +{ + char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL); + bool upper; + int err = 0; + if (!name) return -ENOMEM; - if (upper || layer == Opt_lowerdir) - err = ovl_mount_dir(name, &path); - else - err = ovl_mount_dir_noesc(name, &path); + upper = is_upper_layer(layer); + err = ovl_mount_dir_check(fc, layer_path, layer, name, upper); if (err) - goto out_free; - - err = ovl_mount_dir_check(fc, &path, layer, name, upper); - if (err) - goto out_put; + return err; if (!upper) { err = ovl_ctx_realloc_lower(fc); if (err) - goto out_put; + return err; } /* Store the user provided path string in ctx to show in mountinfo */ - ovl_add_layer(fc, layer, &path, &name); + ovl_add_layer(fc, layer, layer_path, &name); + return err; +} + +static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, + enum ovl_opt layer) +{ + struct path layer_path __free(path_put) = {}; + int err = 0; + + switch (param->type) { + case fs_value_is_string: + err = ovl_kern_path(param->string, &layer_path, layer); + if (err) + return err; + err = ovl_do_parse_layer(fc, param->string, &layer_path, layer); + break; + case fs_value_is_file: { + char *buf __free(kfree); + char *layer_name; + + buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + if (!buf) + return -ENOMEM; + + layer_path = param->file->f_path; + path_get(&layer_path); + + layer_name = d_path(&layer_path, buf, PATH_MAX); + if (IS_ERR(layer_name)) + return PTR_ERR(layer_name); + + err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer); + break; + } + default: + WARN_ON_ONCE(true); + err = -EINVAL; + } -out_put: - path_put(&path); -out_free: - kfree(name); return err; } @@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; for (nr = 0; nr < nr_lower; nr++) { - err = ovl_parse_layer(fc, iter, Opt_lowerdir); + struct path path __free(path_put) = {}; + + err = ovl_kern_path(iter, &path, Opt_lowerdir); + if (err) + goto out_err; + + err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir); if (err) goto out_err; @@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param->string, opt); + err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; diff --git a/fs/pidfs.c b/fs/pidfs.c index 80675b6bf884..618abb1fa1b8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -2,6 +2,7 @@ #include <linux/anon_inodes.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> @@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } +static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +{ + struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + size_t usize = _IOC_SIZE(cmd); + struct pidfd_info kinfo = {}; + struct user_namespace *user_ns; + const struct cred *c; + __u64 mask; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; +#endif + + if (!uinfo) + return -EINVAL; + if (usize < PIDFD_INFO_SIZE_VER0) + return -EINVAL; /* First version, no smaller struct possible */ + + if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) + return -EFAULT; + + c = get_task_cred(task); + if (!c) + return -ESRCH; + + /* Unconditionally return identifiers and credentials, the rest only on request */ + + user_ns = current_user_ns(); + kinfo.ruid = from_kuid_munged(user_ns, c->uid); + kinfo.rgid = from_kgid_munged(user_ns, c->gid); + kinfo.euid = from_kuid_munged(user_ns, c->euid); + kinfo.egid = from_kgid_munged(user_ns, c->egid); + kinfo.suid = from_kuid_munged(user_ns, c->suid); + kinfo.sgid = from_kgid_munged(user_ns, c->sgid); + kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); + kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); + kinfo.mask |= PIDFD_INFO_CREDS; + put_cred(c); + +#ifdef CONFIG_CGROUPS + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); +#endif + + /* + * Copy pid/tgid last, to reduce the chances the information might be + * stale. Note that it is not possible to ensure it will be valid as the + * task might return as soon as the copy_to_user finishes, but that's ok + * and userspace expects that might happen and can act accordingly, so + * this is just best-effort. What we can do however is checking that all + * the fields are set correctly, or return ESRCH to avoid providing + * incomplete information. */ + + kinfo.ppid = task_ppid_nr_ns(task, NULL); + kinfo.tgid = task_tgid_vnr(task); + kinfo.pid = task_pid_vnr(task); + kinfo.mask |= PIDFD_INFO_PID; + + if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) + return -ESRCH; + + /* + * If userspace and the kernel have the same struct size it can just + * be copied. If userspace provides an older struct, only the bits that + * userspace knows about will be copied. If userspace provides a new + * struct, only the bits that the kernel knows about will be copied. + */ + if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) + return -EFAULT; + + return 0; +} + static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; @@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; - if (arg) - return -EINVAL; - task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; + /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ + if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) + return pidfd_info(task, cmd, arg); + + if (arg) + return -EINVAL; + scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. */ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52..e9d7ddc52f69 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 5e391cbca7a3..24baf23e864f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, fd); - rcu_read_unlock(); + file = fget_task(task, fd); if (file) { *mode = file->f_mode; fput(file); @@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - rcu_read_lock(); for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = task_lookup_next_fdget_rcu(p, &fd); + f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; - rcu_read_unlock(); fput(f); data.fd = fd; @@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out; + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); out: put_task_struct(p); return 0; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index f4616083faef..04bb29721419 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v) for (i = 0; i < NR_SOFTIRQS; i++) { seq_printf(p, "%12s:", softirq_to_name[i]); for_each_possible_cpu(j) - seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10); seq_putc(p, '\n'); } return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e52bd96137a6..7eb010de39fe 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2665,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg, return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; + if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) + return -EINVAL; if (arg->vec && !access_ok((void __user *)(long)arg->vec, - arg->vec_len * sizeof(struct page_region))) + size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b52d85f8ad59..b4521b096058 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #endif } -static const struct vm_operations_struct vmcore_mmap_ops = { - .fault = mmap_vmcore_fault, -}; - /** * vmcore_alloc_buf - allocate buffer in vmalloc memory * @size: size of buffer @@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. diff --git a/fs/read_write.c b/fs/read_write.c index 64dc24afdb3a..3e5dad12a5b4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1830,18 +1830,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } -bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos) +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) - return false; + return -EINVAL; if (!is_power_of_2(len)) - return false; + return -EINVAL; + + if (!IS_ALIGNED(iocb->ki_pos, len)) + return -EINVAL; - if (!IS_ALIGNED(pos, len)) - return false; + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; - return true; + return 0; } +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); diff --git a/fs/seq_file.c b/fs/seq_file.c index e676c8b0cf5d..8bbb1ad46335 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. - * @file: file in question * @inode: its inode + * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 15d94ac4095e..0ce2d704b1f3 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1037,6 +1037,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } + put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server); @@ -1635,8 +1636,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) /* srv_count can never go negative */ WARN_ON(server->srv_count < 0); - put_net(cifs_net_ns(server)); - list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -3070,13 +3069,22 @@ generic_ip_connect(struct TCP_Server_Info *server) if (server->ssocket) { socket = server->ssocket; } else { - rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, + struct net *net = cifs_net_ns(server); + struct sock *sk; + + rc = __sock_create(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } + sk = server->ssocket->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); + /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); socket = server->ssocket; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index aa2a37a7ce84..e6a72f75ab94 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -70,6 +70,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); + atomic_set(&conn->mux_smb_requests, 0); conn->total_credits = 1; conn->outstanding_credits = 0; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index b379ae4fdcdf..8ddd5a3c7baf 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -107,6 +107,7 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; + atomic_t mux_smb_requests; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 1e4624e9d434..ad02fe555fda 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -90,7 +90,7 @@ static int __rpc_method(char *rpc_name) int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { - struct ksmbd_session_rpc *entry; + struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; int method; @@ -106,16 +106,19 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) goto free_entry; - xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); + old = xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); + if (xa_is_err(old)) + goto free_id; resp = ksmbd_rpc_open(sess, entry->id); if (!resp) - goto free_id; + goto erase_xa; kvfree(resp); return entry->id; -free_id: +erase_xa: xa_erase(&sess->rpc_handle_list, entry->id); +free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); @@ -175,6 +178,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; + down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { if (atomic_read(&sess->refcnt) == 0 && @@ -188,6 +192,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -229,7 +234,6 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } - up_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { @@ -249,6 +253,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } up_write(&conn->session_lock); + up_write(&sessions_table_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 9670c97f14b3..e6cfedba9992 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -238,11 +238,11 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: - if (work->sess) - ksmbd_user_session_put(work->sess); if (work->tcon) ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); + if (work->sess) + ksmbd_user_session_put(work->sess); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { rc = conn->ops->encrypt_resp(work); @@ -270,6 +270,7 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); + atomic_dec(&conn->mux_smb_requests); /* * Checking waitqueue to dropping pending requests on * disconnection. waitqueue_active is safe because it @@ -291,6 +292,15 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) struct ksmbd_work *work; int err; + err = ksmbd_init_smb_server(conn); + if (err) + return 0; + + if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) { + atomic_dec_return(&conn->mux_smb_requests); + return -ENOSPC; + } + work = ksmbd_alloc_work_struct(); if (!work) { pr_err("allocation for work failed\n"); @@ -301,12 +311,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) work->request_buf = conn->request_buf; conn->request_buf = NULL; - err = ksmbd_init_smb_server(work); - if (err) { - ksmbd_free_work_struct(work); - return 0; - } - ksmbd_conn_enqueue_request(work); atomic_inc(&conn->r_count); /* update activity on connection */ diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index a2ebbe604c8c..75b4eb856d32 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -388,6 +388,10 @@ static struct smb_version_ops smb1_server_ops = { .set_rsp_status = set_smb1_rsp_status, }; +static struct smb_version_values smb1_server_values = { + .max_credits = SMB2_MAX_CREDITS, +}; + static int smb1_negotiate(struct ksmbd_work *work) { return ksmbd_smb_negotiate_common(work, SMB_COM_NEGOTIATE); @@ -399,18 +403,18 @@ static struct smb_version_cmds smb1_server_cmds[1] = { static int init_smb1_server(struct ksmbd_conn *conn) { + conn->vals = &smb1_server_values; conn->ops = &smb1_server_ops; conn->cmds = smb1_server_cmds; conn->max_cmds = ARRAY_SIZE(smb1_server_cmds); return 0; } -int ksmbd_init_smb_server(struct ksmbd_work *work) +int ksmbd_init_smb_server(struct ksmbd_conn *conn) { - struct ksmbd_conn *conn = work->conn; __le32 proto; - proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol; + proto = *(__le32 *)((struct smb_hdr *)conn->request_buf)->Protocol; if (conn->need_neg == false) { if (proto == SMB1_PROTO_NUMBER) return -EINVAL; diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index cc1d6dfe29d5..a3d8a905b07e 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn); int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); -int ksmbd_init_smb_server(struct ksmbd_work *work); +int ksmbd_init_smb_server(struct ksmbd_conn *conn); struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 22251743fadf..d19d4db74af8 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; - int i, n, pages, bytes, res = -ENOMEM; + loff_t index; + int i, pages, bytes, res = -ENOMEM; struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return res; /* Try to grab all the pages covered by the Squashfs block */ - for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == folio->index) ? target_page : - grab_cache_page_nowait(target_page->mapping, n); + for (i = 0, index = start_index; index <= end_index; index++) { + page[i] = (index == folio->index) ? target_page : + grab_cache_page_nowait(target_page->mapping, index); if (page[i] == NULL) continue; diff --git a/fs/stat.c b/fs/stat.c index 41e598376d7e..5ef213343834 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -23,10 +23,46 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include <trace/events/timestamp.h> + #include "internal.h" #include "mount.h" /** + * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED + * @stat: where to store the resulting values + * @request_mask: STATX_* values requested + * @inode: inode from which to grab the c/mtime + * + * Given @inode, grab the ctime and mtime out if it and store the result + * in @stat. When fetching the value, flag it as QUERIED (if not already) + * so the next write will record a distinct timestamp. + * + * NB: The QUERIED flag is tracked in the ctime, but we set it there even + * if only the mtime was requested, as that ensures that the next mtime + * change will be distinct. + */ +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) +{ + atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; + + /* If neither time was requested, then don't report them */ + if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { + stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); + return; + } + + stat->mtime = inode_get_mtime(inode); + stat->ctime.tv_sec = inode->i_ctime_sec; + stat->ctime.tv_nsec = (u32)atomic_read(pcn); + if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) + stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); + stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; + trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); +} +EXPORT_SYMBOL(fill_mg_cmtime); + +/** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask @@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + if (is_mgtime(inode)) { + fill_mg_cmtime(stat, request_mask, inode); + } else { + stat->ctime = inode_get_ctime(inode); + stat->mtime = inode_get_mtime(inode); + } + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; diff --git a/fs/super.c b/fs/super.c index 1db230432960..c9c7223bc2a2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1596,13 +1596,14 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, EXPORT_SYMBOL_GPL(setup_bdev_super); /** - * get_tree_bdev - Get a superblock based on a single block device + * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock + * @flags: GET_TREE_BDEV_* flags */ -int get_tree_bdev(struct fs_context *fc, - int (*fill_super)(struct super_block *, - struct fs_context *)) +int get_tree_bdev_flags(struct fs_context *fc, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; @@ -1613,10 +1614,10 @@ int get_tree_bdev(struct fs_context *fc, error = lookup_bdev(fc->source, &dev); if (error) { - errorf(fc, "%s: Can't lookup blockdev", fc->source); + if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) + errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } - fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) @@ -1644,6 +1645,19 @@ int get_tree_bdev(struct fs_context *fc, fc->root = dget(s->s_root); return 0; } +EXPORT_SYMBOL_GPL(get_tree_bdev_flags); + +/** + * get_tree_bdev - Get a superblock based on a single block device + * @fc: The filesystem context holding the parameters + * @fill_super: Helper to initialise a new superblock + */ +int get_tree_bdev(struct fs_context *fc, + int (*fill_super)(struct super_block *, + struct fs_context *)) +{ + return get_tree_bdev_flags(fc, fill_super, 0); +} EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1748dff58c3b..cfc614c638da 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -392,6 +392,9 @@ static int tracefs_reconfigure(struct fs_context *fc) struct tracefs_fs_info *sb_opts = sb->s_fs_info; struct tracefs_fs_info *new_opts = fc->s_fs_info; + if (!new_opts) + return 0; + sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; @@ -478,14 +481,17 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &tracefs_super_operations; sb->s_d_op = &tracefs_dentry_operations; - tracefs_apply_options(sb, false); - return 0; } static int tracefs_get_tree(struct fs_context *fc) { - return get_tree_single(fc, tracefs_fill_super); + int err = get_tree_single(fc, tracefs_fill_super); + + if (err) + return err; + + return tracefs_reconfigure(fc); } static void tracefs_free_fc(struct fs_context *fc) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 291583005dd1..3fb308b6e167 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -19,9 +19,9 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/seq_file.h> -#include <linux/mount.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" @@ -981,177 +981,120 @@ enum { Opt_auth_key, Opt_auth_hash_name, Opt_ignore, - Opt_err, }; -static const match_table_t tokens = { - {Opt_fast_unmount, "fast_unmount"}, - {Opt_norm_unmount, "norm_unmount"}, - {Opt_bulk_read, "bulk_read"}, - {Opt_no_bulk_read, "no_bulk_read"}, - {Opt_chk_data_crc, "chk_data_crc"}, - {Opt_no_chk_data_crc, "no_chk_data_crc"}, - {Opt_override_compr, "compr=%s"}, - {Opt_auth_key, "auth_key=%s"}, - {Opt_auth_hash_name, "auth_hash_name=%s"}, - {Opt_ignore, "ubi=%s"}, - {Opt_ignore, "vol=%s"}, - {Opt_assert, "assert=%s"}, - {Opt_err, NULL}, +static const struct constant_table ubifs_param_compr[] = { + { "none", UBIFS_COMPR_NONE }, + { "lzo", UBIFS_COMPR_LZO }, + { "zlib", UBIFS_COMPR_ZLIB }, + { "zstd", UBIFS_COMPR_ZSTD }, + {} }; -/** - * parse_standard_option - parse a standard mount option. - * @option: the option to parse - * - * Normally, standard mount options like "sync" are passed to file-systems as - * flags. However, when a "rootflags=" kernel boot parameter is used, they may - * be present in the options string. This function tries to deal with this - * situation and parse standard options. Returns 0 if the option was not - * recognized, and the corresponding integer flag if it was. - * - * UBIFS is only interested in the "sync" option, so do not check for anything - * else. - */ -static int parse_standard_option(const char *option) -{ +static const struct constant_table ubifs_param_assert[] = { + { "report", ASSACT_REPORT }, + { "read-only", ASSACT_RO }, + { "panic", ASSACT_PANIC }, + {} +}; - pr_notice("UBIFS: parse %s\n", option); - if (!strcmp(option, "sync")) - return SB_SYNCHRONOUS; - return 0; -} +static const struct fs_parameter_spec ubifs_fs_param_spec[] = { + fsparam_flag ("fast_unmount", Opt_fast_unmount), + fsparam_flag ("norm_unmount", Opt_norm_unmount), + fsparam_flag ("bulk_read", Opt_bulk_read), + fsparam_flag ("no_bulk_read", Opt_no_bulk_read), + fsparam_flag ("chk_data_crc", Opt_chk_data_crc), + fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc), + fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr), + fsparam_enum ("assert", Opt_assert, ubifs_param_assert), + fsparam_string ("auth_key", Opt_auth_key), + fsparam_string ("auth_hash_name", Opt_auth_hash_name), + fsparam_string ("ubi", Opt_ignore), + fsparam_string ("vol", Opt_ignore), + {} +}; + +struct ubifs_fs_context { + struct ubifs_mount_opts mount_opts; + char *auth_key_name; + char *auth_hash_name; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int assert_action:2; +}; /** - * ubifs_parse_options - parse mount parameters. - * @c: UBIFS file-system description object - * @options: parameters to parse - * @is_remount: non-zero if this is FS re-mount + * ubifs_parse_param - parse a parameter. + * @fc: the filesystem context + * @param: the parameter to parse * * This function parses UBIFS mount options and returns zero in case success * and a negative error code in case of failure. */ -static int ubifs_parse_options(struct ubifs_info *c, char *options, - int is_remount) +static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - - if (!options) - return 0; + struct ubifs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); + int opt; - while ((p = strsep(&options, ","))) { - int token; + opt = fs_parse(fc, ubifs_fs_param_spec, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { + switch (opt) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. * We accept them in order to be backward-compatible. But this * should be removed at some point. */ - case Opt_fast_unmount: - c->mount_opts.unmount_mode = 2; - break; - case Opt_norm_unmount: - c->mount_opts.unmount_mode = 1; - break; - case Opt_bulk_read: - c->mount_opts.bulk_read = 2; - c->bulk_read = 1; - break; - case Opt_no_bulk_read: - c->mount_opts.bulk_read = 1; - c->bulk_read = 0; - break; - case Opt_chk_data_crc: - c->mount_opts.chk_data_crc = 2; - c->no_chk_data_crc = 0; - break; - case Opt_no_chk_data_crc: - c->mount_opts.chk_data_crc = 1; - c->no_chk_data_crc = 1; - break; - case Opt_override_compr: - { - char *name = match_strdup(&args[0]); - - if (!name) - return -ENOMEM; - if (!strcmp(name, "none")) - c->mount_opts.compr_type = UBIFS_COMPR_NONE; - else if (!strcmp(name, "lzo")) - c->mount_opts.compr_type = UBIFS_COMPR_LZO; - else if (!strcmp(name, "zlib")) - c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; - else if (!strcmp(name, "zstd")) - c->mount_opts.compr_type = UBIFS_COMPR_ZSTD; - else { - ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? - kfree(name); - return -EINVAL; - } - kfree(name); - c->mount_opts.override_compr = 1; - c->default_compr = c->mount_opts.compr_type; - break; - } - case Opt_assert: - { - char *act = match_strdup(&args[0]); - - if (!act) - return -ENOMEM; - if (!strcmp(act, "report")) - c->assert_action = ASSACT_REPORT; - else if (!strcmp(act, "read-only")) - c->assert_action = ASSACT_RO; - else if (!strcmp(act, "panic")) - c->assert_action = ASSACT_PANIC; - else { - ubifs_err(c, "unknown assert action \"%s\"", act); - kfree(act); - return -EINVAL; - } - kfree(act); - break; - } - case Opt_auth_key: - if (!is_remount) { - c->auth_key_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; - } - break; - case Opt_auth_hash_name: - if (!is_remount) { - c->auth_hash_name = kstrdup(args[0].from, - GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; - } - break; - case Opt_ignore: - break; - default: - { - unsigned long flag; - struct super_block *sb = c->vfs_sb; - - flag = parse_standard_option(p); - if (!flag) { - ubifs_err(c, "unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - sb->s_flags |= flag; - break; + case Opt_fast_unmount: + ctx->mount_opts.unmount_mode = 2; + break; + case Opt_norm_unmount: + ctx->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + ctx->mount_opts.bulk_read = 2; + ctx->bulk_read = 1; + break; + case Opt_no_bulk_read: + ctx->mount_opts.bulk_read = 1; + ctx->bulk_read = 0; + break; + case Opt_chk_data_crc: + ctx->mount_opts.chk_data_crc = 2; + ctx->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + ctx->mount_opts.chk_data_crc = 1; + ctx->no_chk_data_crc = 1; + break; + case Opt_override_compr: + ctx->mount_opts.compr_type = result.uint_32; + ctx->mount_opts.override_compr = 1; + ctx->default_compr = ctx->mount_opts.compr_type; + break; + case Opt_assert: + ctx->assert_action = result.uint_32; + break; + case Opt_auth_key: + if (!is_remount) { + kfree(ctx->auth_key_name); + ctx->auth_key_name = param->string; + param->string = NULL; } + break; + case Opt_auth_hash_name: + if (!is_remount) { + kfree(ctx->auth_hash_name); + ctx->auth_hash_name = param->string; + param->string = NULL; } + break; + case Opt_ignore: + break; } return 0; @@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb) mutex_unlock(&c->umount_mutex); } -static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +static int ubifs_reconfigure(struct fs_context *fc) { + struct ubifs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; int err; struct ubifs_info *c = sb->s_fs_info; sync_filesystem(sb); - dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags); - err = ubifs_parse_options(c, data, 1); - if (err) { - ubifs_err(c, "invalid or unknown remount parameter"); - return err; - } + /* + * Apply the mount option changes. + * auth_key_name and auth_hash_name are ignored on remount. + */ + c->mount_opts = ctx->mount_opts; + c->bulk_read = ctx->bulk_read; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; - if (c->ro_mount && !(*flags & SB_RDONLY)) { + if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & SB_RDONLY)) { + } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = { .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, - .remount_fs = ubifs_remount_fs, .show_options = ubifs_show_options, .sync_fs = ubifs_sync_fs, }; /** * open_ubi - parse UBI device name string and open the UBI device. - * @name: UBI volume name + * @fc: The filesystem context * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume @@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = { * returns UBI volume description object in case of success and a negative * error code in case of failure. */ -static struct ubi_volume_desc *open_ubi(const char *name, int mode) +static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode) { struct ubi_volume_desc *ubi; + const char *name = fc->source; int dev, vol; char *endptr; - if (!name || !*name) - return ERR_PTR(-EINVAL); - /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) @@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') - return ERR_PTR(-EINVAL); + goto invalid_source; /* ubi:NAME method */ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') return ubi_open_volume_nm(0, name + 4, mode); if (!isdigit(name[3])) - return ERR_PTR(-EINVAL); + goto invalid_source; dev = simple_strtoul(name + 3, &endptr, 0); @@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if (*endptr == '_' && isdigit(endptr[1])) { vol = simple_strtoul(endptr + 1, &endptr, 0); if (*endptr != '\0') - return ERR_PTR(-EINVAL); + goto invalid_source; return ubi_open_volume(dev, vol, mode); } @@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') return ubi_open_volume_nm(dev, ++endptr, mode); - return ERR_PTR(-EINVAL); +invalid_source: + return ERR_PTR(invalf(fc, "Invalid source name")); } static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) @@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } -static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c = sb->s_fs_info; + struct ubifs_fs_context *ctx = fc->fs_private; struct inode *root; int err; @@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out; } - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_close; + /* Copy in parsed mount options */ + c->mount_opts = ctx->mount_opts; + c->auth_key_name = ctx->auth_key_name; + c->auth_hash_name = ctx->auth_hash_name; + c->no_chk_data_crc = ctx->no_chk_data_crc; + c->bulk_read = ctx->bulk_read; + c->default_compr = ctx->default_compr; + c->assert_action = ctx->assert_action; + + /* ubifs_info owns auth strings now */ + ctx->auth_key_name = NULL; + ctx->auth_hash_name = NULL; /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For @@ -2264,41 +2221,38 @@ out: return err; } -static int sb_test(struct super_block *sb, void *data) +static int sb_test(struct super_block *sb, struct fs_context *fc) { - struct ubifs_info *c1 = data; + struct ubifs_info *c1 = fc->s_fs_info; struct ubifs_info *c = sb->s_fs_info; return c->vi.cdev == c1->vi.cdev; } -static int sb_set(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, - const char *name, void *data) +static int ubifs_get_tree(struct fs_context *fc) { struct ubi_volume_desc *ubi; struct ubifs_info *c; struct super_block *sb; int err; - dbg_gen("name %s, flags %#x", name, flags); + if (!fc->source || !*fc->source) + return invalf(fc, "No source specified"); + + dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags); /* * Get UBI device number and volume ID. Mount it read-only so far * because this might be a new mount point, and UBI allows only one * read-write user at a time. */ - ubi = open_ubi(name, UBI_READONLY); + ubi = open_ubi(fc, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & SB_SILENT)) + err = PTR_ERR(ubi); + if (!(fc->sb_flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); - return ERR_CAST(ubi); + current->pid, fc->source, err); + return err; } c = alloc_ubifs_info(ubi); @@ -2306,10 +2260,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, err = -ENOMEM; goto out_close; } + fc->s_fs_info = c; dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, sb_test, sb_set, flags, c); + sb = sget_fc(fc, sb_test, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); @@ -2321,12 +2276,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & SB_RDONLY) != c1->ro_mount) { + if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0); + err = ubifs_fill_super(sb, fc); if (err) goto out_deact; /* We do not support atime */ @@ -2340,13 +2295,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return ERR_PTR(err); + return err; } static void kill_ubifs_super(struct super_block *s) @@ -2356,10 +2312,61 @@ static void kill_ubifs_super(struct super_block *s) kfree(c); } +static void ubifs_free_fc(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->auth_key_name); + kfree(ctx->auth_hash_name); + kfree(ctx); + } +} + +static const struct fs_context_operations ubifs_context_ops = { + .free = ubifs_free_fc, + .parse_param = ubifs_parse_param, + .get_tree = ubifs_get_tree, + .reconfigure = ubifs_reconfigure, +}; + +static int ubifs_init_fs_context(struct fs_context *fc) +{ + struct ubifs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { + /* Iniitialize for first mount */ + ctx->no_chk_data_crc = 1; + ctx->assert_action = ASSACT_RO; + } else { + struct ubifs_info *c = fc->root->d_sb->s_fs_info; + + /* + * Preserve existing options across remounts. + * auth_key_name and auth_hash_name are not remountable. + */ + ctx->mount_opts = c->mount_opts; + ctx->bulk_read = c->bulk_read; + ctx->no_chk_data_crc = c->no_chk_data_crc; + ctx->default_compr = c->default_compr; + ctx->assert_action = c->assert_action; + } + + fc->ops = &ubifs_context_ops; + fc->fs_private = ctx; + + return 0; +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .mount = ubifs_mount, + .init_fs_context = ubifs_init_fs_context, + .parameters = ubifs_fs_param_spec, .kill_sb = kill_ubifs_super, }; MODULE_ALIAS_FS("ubifs"); diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 8395066341a4..7f7cb14e01ce 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um) } EXPORT_SYMBOL(utf8_unload); +/** + * utf8_parse_version - Parse a UTF-8 version number from a string + * + * @version: input string + * + * Returns the parsed version on success, negative code on error + */ +int utf8_parse_version(char *version) +{ + substring_t args[3]; + unsigned int maj, min, rev; + static const struct match_token token[] = { + {1, "%d.%d.%d"}, + {0, NULL} + }; + + if (match_token(version, token, args) != 1) + return -EINVAL; + + if (match_int(&args[0], &maj) || match_int(&args[1], &min) || + match_int(&args[2], &rev)) + return -EINVAL; + + return UNICODE_AGE(maj, min, rev); +} +EXPORT_SYMBOL(utf8_parse_version); diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index 600e15efe9ed..5ddaf27b21a6 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -17,9 +17,6 @@ static unsigned int failed_tests; static unsigned int total_tests; -/* Tests will be based on this version. */ -#define UTF8_LATEST UNICODE_AGE(12, 1, 0) - #define _test(cond, func, line, fmt, ...) do { \ total_tests++; \ if (!cond) { \ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 68cdd89c97a3..7c0bd0b55f88 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs) } } +void dup_userfaultfd_fail(struct list_head *fcs) +{ + struct userfaultfd_fork_ctx *fctx, *n; + + /* + * An error has occurred on fork, we will tear memory down, but have + * allocated memory for fctx's and raised reference counts for both the + * original and child contexts (and on the mm for each as a result). + * + * These would ordinarily be taken care of by a user handling the event, + * but we are no longer doing so, so manually clean up here. + * + * mm tear down will take care of cleaning up VMA contexts. + */ + list_for_each_entry_safe(fctx, n, fcs, list) { + struct userfaultfd_ctx *octx = fctx->orig; + struct userfaultfd_ctx *ctx = fctx->new; + + atomic_dec(&octx->mmap_changing); + VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0); + userfaultfd_ctx_put(octx); + userfaultfd_ctx_put(ctx); + + list_del(&fctx->list); + kfree(fctx); + } +} + void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct vm_userfaultfd_ctx *vm_ctx) { diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 04f64cf9777e..22bdbb3e9980 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1923,7 +1923,7 @@ restart: error = -EFSCORRUPTED; goto error0; } - if (flen < bestrlen) + if (flen <= bestrlen) break; busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen, &busy_gen); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 3c40f37e82c7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b536..e8196f5778e2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2115,6 +2115,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + if (bdev_can_atomic_write(btp->bt_bdev)) { + btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( + btp->bt_bdev); + btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( + btp->bt_bdev); + } + /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 209a389f2abc..3d56bc7a35cc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -124,6 +124,10 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b19916b11fd5..ca47cae5a40a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -852,6 +852,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * Currently only atomic writing of a single FS block is + * supported. It would be possible to atomic write smaller than + * a FS block, but there is no requirement to support this. + * Note that iomap also does not support this yet. + */ + if (ocount != ip->i_mount->m_sb.sb_blocksize) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1239,6 +1253,8 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; + if (xfs_inode_can_atomicwrite(XFS_I(inode))) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e3aaa0555597..290ba8887d29 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -64,25 +64,31 @@ xfs_filestream_pick_ag( struct xfs_perag *pag; struct xfs_perag *max_pag = NULL; xfs_extlen_t minlen = *longest; - xfs_extlen_t free = 0, minfree, maxfree = 0; + xfs_extlen_t minfree, maxfree = 0; xfs_agnumber_t agno; bool first_pass = true; - int err; /* 2% of an AG's blocks must be free for it to be chosen. */ minfree = mp->m_sb.sb_agblocks / 50; restart: for_each_perag_wrap(mp, start_agno, agno, pag) { + int err; + trace_xfs_filestream_scan(pag, pino); + *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - if (err != -EAGAIN) - break; - /* Couldn't lock the AGF, skip this AG. */ - err = 0; - continue; + if (err == -EAGAIN) { + /* Couldn't lock the AGF, skip this AG. */ + err = 0; + continue; + } + xfs_perag_rele(pag); + if (max_pag) + xfs_perag_rele(max_pag); + return err; } /* Keep track of the AG with the most free blocks. */ @@ -107,8 +113,9 @@ restart: !(flags & XFS_PICK_USERDATA) || (flags & XFS_PICK_LOWSPACE))) { /* Break out, retaining the reference on the AG. */ - free = pag->pagf_freeblks; - break; + if (max_pag) + xfs_perag_rele(max_pag); + goto done; } } @@ -116,57 +123,47 @@ restart: atomic_dec(&pag->pagf_fstrms); } - if (err) { - xfs_perag_rele(pag); - if (max_pag) - xfs_perag_rele(max_pag); - return err; + /* + * Allow a second pass to give xfs_bmap_longest_free_extent() another + * attempt at locking AGFs that it might have skipped over before we + * fail. + */ + if (first_pass) { + first_pass = false; + goto restart; } - if (!pag) { - /* - * Allow a second pass to give xfs_bmap_longest_free_extent() - * another attempt at locking AGFs that it might have skipped - * over before we fail. - */ - if (first_pass) { - first_pass = false; - goto restart; - } + /* + * We must be low on data space, so run a final lowspace optimised + * selection pass if we haven't already. + */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + goto restart; + } - /* - * We must be low on data space, so run a final lowspace - * optimised selection pass if we haven't already. - */ - if (!(flags & XFS_PICK_LOWSPACE)) { - flags |= XFS_PICK_LOWSPACE; - goto restart; + /* + * No unassociated AGs are available, so select the AG with the most + * free space, regardless of whether it's already in use by another + * filestream. It none suit, just use whatever AG we can grab. + */ + if (!max_pag) { + for_each_perag_wrap(args->mp, 0, start_agno, pag) { + max_pag = pag; + break; } - /* - * No unassociated AGs are available, so select the AG with the - * most free space, regardless of whether it's already in use by - * another filestream. It none suit, just use whatever AG we can - * grab. - */ - if (!max_pag) { - for_each_perag_wrap(args->mp, 0, start_agno, args->pag) - break; - atomic_inc(&args->pag->pagf_fstrms); - *longest = 0; - } else { - pag = max_pag; - free = maxfree; - atomic_inc(&pag->pagf_fstrms); - } - } else if (max_pag) { - xfs_perag_rele(max_pag); + /* Bail if there are no AGs at all to select from. */ + if (!max_pag) + return -ENOSPC; } - trace_xfs_filestream_pick(pag, pino, free); + pag = max_pag; + atomic_inc(&pag->pagf_fstrms); +done: + trace_xfs_filestream_pick(pag, pino); args->pag = pag; return 0; - } static struct xfs_inode * diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bcc277fc0a83..19dcb569a3e7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1409,7 +1409,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) + xfs_inode_has_filedata(ip))) truncate = 1; if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97ed912306fd..a2a6b5fd2545 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -292,6 +292,11 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip) return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } +static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) +{ + return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; +} + /* * Check if an inode has any data in the COW fork. This might be often false * even for inodes with the reflink flag when there is no pending COW operation. @@ -327,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool +xfs_inode_can_atomicwrite( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + + if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + return false; + if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + return false; + + return true; +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a20d426ef021..2567fd2a0994 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -481,7 +481,7 @@ xfs_ioctl_setattr_xflags( if (rtflag != XFS_IS_REALTIME_INODE(ip)) { /* Can't change realtime flag if any extents are allocated. */ - if (ip->i_df.if_nextents || ip->i_delayed_blks) + if (xfs_inode_has_filedata(ip)) return -EINVAL; /* @@ -602,7 +602,7 @@ xfs_ioctl_setattr_check_extsize( if (!fa->fsx_valid) return 0; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) return -EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 916531d9f83c..86da16f54be9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -707,7 +707,7 @@ imap_needs_cow( return false; /* when zeroing we don't have to COW holes or unwritten extents */ - if (flags & IOMAP_ZERO) { + if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) { if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_state == XFS_EXT_UNWRITTEN) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ee79cf161312..4084d26f0d78 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -570,6 +570,20 @@ xfs_stat_blksize( return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } +static void +xfs_get_atomic_write_attr( + struct xfs_inode *ip, + unsigned int *unit_min, + unsigned int *unit_max) +{ + if (!xfs_inode_can_atomicwrite(ip)) { + *unit_min = *unit_max = 0; + return; + } + + *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize; +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -597,8 +611,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +623,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -643,6 +653,14 @@ xfs_vn_getattr( stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } + if (request_mask & STATX_WRITE_ATOMIC) { + unsigned int unit_min, unit_max; + + xfs_get_atomic_write_attr(ip, &unit_min, + &unit_max); + generic_fill_statx_atomic_writes(stat, + unit_min, unit_max); + } fallthrough; default: stat->blksize = xfs_stat_blksize(ip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fbb3a1594c0d..fda75db739b1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2063,7 +2063,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ee9f0b1f548d..fcb2bad4f76e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -691,8 +691,8 @@ DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); TRACE_EVENT(xfs_filestream_pick, - TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free), - TP_ARGS(pag, ino, free), + TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), + TP_ARGS(pag, ino), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) @@ -703,14 +703,9 @@ TRACE_EVENT(xfs_filestream_pick, TP_fast_assign( __entry->dev = pag->pag_mount->m_super->s_dev; __entry->ino = ino; - if (pag) { - __entry->agno = pag->pag_agno; - __entry->streams = atomic_read(&pag->pagf_fstrms); - } else { - __entry->agno = NULLAGNUMBER; - __entry->streams = 0; - } - __entry->free = free; + __entry->agno = pag->pag_agno; + __entry->streams = atomic_read(&pag->pagf_fstrms); + __entry->free = pag->pagf_freeblks; ), TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d", MAJOR(__entry->dev), MINOR(__entry->dev), |