Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/adfs/super.c | 186
-rw-r--r--  fs/affs/super.c | 374
-rw-r--r--  fs/aio.c | 3
-rw-r--r--  fs/attr.c | 61
-rw-r--r--  fs/backing-file.c | 53
-rw-r--r--  fs/befs/linuxvfs.c | 199
-rw-r--r--  fs/binfmt_elf.c | 6
-rw-r--r--  fs/binfmt_elf_fdpic.c | 6
-rw-r--r--  fs/btrfs/Kconfig | 26
-rw-r--r--  fs/btrfs/Makefile | 3
-rw-r--r--  fs/btrfs/backref.c | 3
-rw-r--r--  fs/btrfs/bio.c | 2
-rw-r--r--  fs/btrfs/block-group.c | 2
-rw-r--r--  fs/btrfs/btrfs_inode.h | 15
-rw-r--r--  fs/btrfs/compression.c | 14
-rw-r--r--  fs/btrfs/compression.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 132
-rw-r--r--  fs/btrfs/ctree.h | 13
-rw-r--r--  fs/btrfs/delayed-inode.h | 2
-rw-r--r--  fs/btrfs/delayed-ref.c | 325
-rw-r--r--  fs/btrfs/delayed-ref.h | 64
-rw-r--r--  fs/btrfs/dev-replace.c | 4
-rw-r--r--  fs/btrfs/dir-item.c | 11
-rw-r--r--  fs/btrfs/dir-item.h | 3
-rw-r--r--  fs/btrfs/direct-io.c | 2
-rw-r--r--  fs/btrfs/disk-io.c | 93
-rw-r--r--  fs/btrfs/disk-io.h | 6
-rw-r--r--  fs/btrfs/extent-tree.c | 104
-rw-r--r--  fs/btrfs/extent_io.c | 118
-rw-r--r--  fs/btrfs/extent_map.c | 122
-rw-r--r--  fs/btrfs/extent_map.h | 3
-rw-r--r--  fs/btrfs/fiemap.c | 6
-rw-r--r--  fs/btrfs/file.c | 374
-rw-r--r--  fs/btrfs/file.h | 7
-rw-r--r--  fs/btrfs/free-space-cache.c | 22
-rw-r--r--  fs/btrfs/fs.h | 16
-rw-r--r--  fs/btrfs/inode.c | 505
-rw-r--r--  fs/btrfs/ioctl.c | 483
-rw-r--r--  fs/btrfs/ioctl.h | 2
-rw-r--r--  fs/btrfs/locking.c | 15
-rw-r--r--  fs/btrfs/locking.h | 1
-rw-r--r--  fs/btrfs/lzo.c | 2
-rw-r--r--  fs/btrfs/ordered-data.c | 14
-rw-r--r--  fs/btrfs/qgroup.c | 90
-rw-r--r--  fs/btrfs/qgroup.h | 17
-rw-r--r--  fs/btrfs/raid-stripe-tree.c | 92
-rw-r--r--  fs/btrfs/raid-stripe-tree.h | 5
-rw-r--r--  fs/btrfs/raid56.c | 3
-rw-r--r--  fs/btrfs/relocation.c | 2
-rw-r--r--  fs/btrfs/scrub.c | 37
-rw-r--r--  fs/btrfs/send.c | 61
-rw-r--r--  fs/btrfs/send.h | 2
-rw-r--r--  fs/btrfs/space-info.c | 12
-rw-r--r--  fs/btrfs/subpage.c | 204
-rw-r--r--  fs/btrfs/subpage.h | 39
-rw-r--r--  fs/btrfs/super.c | 36
-rw-r--r--  fs/btrfs/sysfs.c | 4
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 4
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 2
-rw-r--r--  fs/btrfs/tests/raid-stripe-tree-tests.c | 538
-rw-r--r--  fs/btrfs/transaction.c | 8
-rw-r--r--  fs/btrfs/transaction.h | 2
-rw-r--r--  fs/btrfs/tree-checker.c | 16
-rw-r--r--  fs/btrfs/tree-checker.h | 4
-rw-r--r--  fs/btrfs/tree-log.c | 3
-rw-r--r--  fs/btrfs/tree-mod-log.c | 1
-rw-r--r--  fs/btrfs/tree-mod-log.h | 1
-rw-r--r--  fs/btrfs/volumes.c | 163
-rw-r--r--  fs/btrfs/volumes.h | 11
-rw-r--r--  fs/btrfs/xattr.c | 5
-rw-r--r--  fs/btrfs/zlib.c | 2
-rw-r--r--  fs/btrfs/zoned.c | 17
-rw-r--r--  fs/btrfs/zstd.c | 4
-rw-r--r--  fs/buffer.c | 8
-rw-r--r--  fs/cachefiles/interface.c | 14
-rw-r--r--  fs/cachefiles/namei.c | 5
-rw-r--r--  fs/cachefiles/ondemand.c | 38
-rw-r--r--  fs/ceph/addr.c | 20
-rw-r--r--  fs/char_dev.c | 2
-rw-r--r--  fs/compat_binfmt_elf.c | 10
-rw-r--r--  fs/configfs/configfs_internal.h | 4
-rw-r--r--  fs/configfs/dir.c | 42
-rw-r--r--  fs/configfs/inode.c | 25
-rw-r--r--  fs/coredump.c | 1
-rw-r--r--  fs/crypto/keyring.c | 1
-rw-r--r--  fs/dcache.c | 16
-rw-r--r--  fs/debugfs/file.c | 100
-rw-r--r--  fs/debugfs/inode.c | 63
-rw-r--r--  fs/debugfs/internal.h | 6
-rw-r--r--  fs/dlm/ast.c | 2
-rw-r--r--  fs/dlm/config.c | 170
-rw-r--r--  fs/dlm/config.h | 26
-rw-r--r--  fs/dlm/lock.c | 73
-rw-r--r--  fs/dlm/lowcomms.c | 8
-rw-r--r--  fs/dlm/member.c | 2
-rw-r--r--  fs/dlm/recover.c | 35
-rw-r--r--  fs/dlm/recoverd.c | 2
-rw-r--r--  fs/ecryptfs/crypto.c | 35
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 9
-rw-r--r--  fs/ecryptfs/inode.c | 12
-rw-r--r--  fs/ecryptfs/mmap.c | 136
-rw-r--r--  fs/ecryptfs/read_write.c | 50
-rw-r--r--  fs/efs/super.c | 43
-rw-r--r--  fs/erofs/data.c | 69
-rw-r--r--  fs/erofs/inode.c | 12
-rw-r--r--  fs/erofs/internal.h | 35
-rw-r--r--  fs/erofs/super.c | 35
-rw-r--r--  fs/erofs/sysfs.c | 17
-rw-r--r--  fs/erofs/zdata.c | 221
-rw-r--r--  fs/erofs/zmap.c | 17
-rw-r--r--  fs/erofs/zutil.c | 155
-rw-r--r--  fs/eventfd.c | 9
-rw-r--r--  fs/eventpoll.c | 87
-rw-r--r--  fs/exec.c | 2
-rw-r--r--  fs/ext4/balloc.c | 4
-rw-r--r--  fs/ext4/dir.c | 7
-rw-r--r--  fs/ext4/ext4.h | 22
-rw-r--r--  fs/ext4/extents.c | 13
-rw-r--r--  fs/ext4/extents_status.c | 8
-rw-r--r--  fs/ext4/extents_status.h | 3
-rw-r--r--  fs/ext4/fast_commit.c | 8
-rw-r--r--  fs/ext4/file.c | 36
-rw-r--r--  fs/ext4/fsmap.c | 54
-rw-r--r--  fs/ext4/ialloc.c | 5
-rw-r--r--  fs/ext4/indirect.c | 2
-rw-r--r--  fs/ext4/inode.c | 109
-rw-r--r--  fs/ext4/ioctl.c | 21
-rw-r--r--  fs/ext4/mballoc.c | 22
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/mmp.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 2
-rw-r--r--  fs/ext4/namei.c | 23
-rw-r--r--  fs/ext4/page-io.c | 6
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 113
-rw-r--r--  fs/f2fs/data.c | 9
-rw-r--r--  fs/f2fs/file.c | 15
-rw-r--r--  fs/fcntl.c | 46
-rw-r--r--  fs/fhandle.c | 5
-rw-r--r--  fs/file.c | 288
-rw-r--r--  fs/file_table.c | 50
-rw-r--r--  fs/freevxfs/vxfs_dir.h | 2
-rw-r--r--  fs/fs-writeback.c | 40
-rw-r--r--  fs/fs_parser.c | 21
-rw-r--r--  fs/fsopen.c | 19
-rw-r--r--  fs/fuse/dev.c | 6
-rw-r--r--  fs/fuse/passthrough.c | 32
-rw-r--r--  fs/gfs2/export.c | 1
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/gfs2/glock.c | 12
-rw-r--r--  fs/hfs/super.c | 342
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 7
-rw-r--r--  fs/hfsplus/options.c | 263
-rw-r--r--  fs/hfsplus/super.c | 84
-rw-r--r--  fs/hfsplus/wrapper.c | 2
-rw-r--r--  fs/hpfs/super.c | 414
-rw-r--r--  fs/hugetlbfs/inode.c | 19
-rw-r--r--  fs/inode.c | 313
-rw-r--r--  fs/internal.h | 18
-rw-r--r--  fs/ioctl.c | 23
-rw-r--r--  fs/iomap/buffered-io.c | 8
-rw-r--r--  fs/iomap/direct-io.c | 43
-rw-r--r--  fs/iomap/trace.h | 3
-rw-r--r--  fs/isofs/inode.c | 8
-rw-r--r--  fs/jbd2/commit.c | 4
-rw-r--r--  fs/jbd2/journal.c | 15
-rw-r--r--  fs/jbd2/recovery.c | 311
-rw-r--r--  fs/jfs/jfs_dmap.c | 6
-rw-r--r--  fs/jfs/jfs_dtree.c | 15
-rw-r--r--  fs/jfs/jfs_filsys.h | 1
-rw-r--r--  fs/jfs/super.c | 469
-rw-r--r--  fs/jfs/xattr.c | 2
-rw-r--r--  fs/kernel_read_file.c | 12
-rw-r--r--  fs/libfs.c | 23
-rw-r--r--  fs/lockd/svclock.c | 7
-rw-r--r--  fs/locks.c | 15
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 88
-rw-r--r--  fs/namespace.c | 208
-rw-r--r--  fs/netfs/buffered_read.c | 8
-rw-r--r--  fs/netfs/buffered_write.c | 41
-rw-r--r--  fs/netfs/fscache_volume.c | 3
-rw-r--r--  fs/nfsd/nfs4state.c | 19
-rw-r--r--  fs/nilfs2/page.c | 22
-rw-r--r--  fs/notify/dnotify/dnotify.c | 5
-rw-r--r--  fs/notify/fanotify/Kconfig | 1
-rw-r--r--  fs/notify/fanotify/fanotify.c | 1
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 130
-rw-r--r--  fs/notify/fsnotify.c | 23
-rw-r--r--  fs/notify/inotify/inotify_user.c | 38
-rw-r--r--  fs/notify/mark.c | 12
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 24
-rw-r--r--  fs/ocfs2/export.c | 1
-rw-r--r--  fs/ocfs2/file.c | 2
-rw-r--r--  fs/open.c | 82
-rw-r--r--  fs/overlayfs/copy_up.c | 3
-rw-r--r--  fs/overlayfs/dir.c | 68
-rw-r--r--  fs/overlayfs/file.c | 327
-rw-r--r--  fs/overlayfs/inode.c | 37
-rw-r--r--  fs/overlayfs/namei.c | 10
-rw-r--r--  fs/overlayfs/overlayfs.h | 12
-rw-r--r--  fs/overlayfs/params.c | 116
-rw-r--r--  fs/overlayfs/readdir.c | 8
-rw-r--r--  fs/overlayfs/util.c | 14
-rw-r--r--  fs/overlayfs/xattrs.c | 9
-rw-r--r--  fs/pidfs.c | 86
-rw-r--r--  fs/posix_acl.c | 13
-rw-r--r--  fs/proc/base.c | 5
-rw-r--r--  fs/proc/fd.c | 12
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/interrupts.c | 4
-rw-r--r--  fs/proc/proc_sysctl.c | 113
-rw-r--r--  fs/proc/stat.c | 4
-rw-r--r--  fs/proc/task_mmu.c | 2
-rw-r--r--  fs/quota/Kconfig | 15
-rw-r--r--  fs/quota/dquot.c | 1
-rw-r--r--  fs/quota/quota.c | 12
-rw-r--r--  fs/read_write.c | 161
-rw-r--r--  fs/readdir.c | 28
-rw-r--r--  fs/reiserfs/Kconfig | 91
-rw-r--r--  fs/reiserfs/Makefile | 30
-rw-r--r--  fs/reiserfs/README | 151
-rw-r--r--  fs/reiserfs/acl.h | 78
-rw-r--r--  fs/reiserfs/bitmap.c | 1476
-rw-r--r--  fs/reiserfs/dir.c | 346
-rw-r--r--  fs/reiserfs/do_balan.c | 1900
-rw-r--r--  fs/reiserfs/file.c | 270
-rw-r--r--  fs/reiserfs/fix_node.c | 2822
-rw-r--r--  fs/reiserfs/hashes.c | 177
-rw-r--r--  fs/reiserfs/ibalance.c | 1161
-rw-r--r--  fs/reiserfs/inode.c | 3416
-rw-r--r--  fs/reiserfs/ioctl.c | 221
-rw-r--r--  fs/reiserfs/item_ops.c | 737
-rw-r--r--  fs/reiserfs/journal.c | 4404
-rw-r--r--  fs/reiserfs/lbalance.c | 1426
-rw-r--r--  fs/reiserfs/lock.c | 101
-rw-r--r--  fs/reiserfs/namei.c | 1725
-rw-r--r--  fs/reiserfs/objectid.c | 216
-rw-r--r--  fs/reiserfs/prints.c | 792
-rw-r--r--  fs/reiserfs/procfs.c | 490
-rw-r--r--  fs/reiserfs/reiserfs.h | 3419
-rw-r--r--  fs/reiserfs/resize.c | 230
-rw-r--r--  fs/reiserfs/stree.c | 2280
-rw-r--r--  fs/reiserfs/super.c | 2646
-rw-r--r--  fs/reiserfs/tail_conversion.c | 318
-rw-r--r--  fs/reiserfs/xattr.c | 1039
-rw-r--r--  fs/reiserfs/xattr.h | 117
-rw-r--r--  fs/reiserfs/xattr_acl.c | 411
-rw-r--r--  fs/reiserfs/xattr_security.c | 127
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 46
-rw-r--r--  fs/reiserfs/xattr_user.c | 43
-rw-r--r--  fs/remap_range.c | 11
-rw-r--r--  fs/select.c | 48
-rw-r--r--  fs/seq_file.c | 2
-rw-r--r--  fs/signalfd.c | 9
-rw-r--r--  fs/smb/client/ioctl.c | 11
-rw-r--r--  fs/splice.c | 78
-rw-r--r--  fs/stat.c | 98
-rw-r--r--  fs/statfs.c | 12
-rw-r--r--  fs/sync.c | 29
-rw-r--r--  fs/timerfd.c | 44
-rw-r--r--  fs/ubifs/super.c | 399
-rw-r--r--  fs/ufs/balloc.c | 107
-rw-r--r--  fs/ufs/cylinder.c | 31
-rw-r--r--  fs/ufs/dir.c | 29
-rw-r--r--  fs/ufs/file.c | 1
-rw-r--r--  fs/ufs/inode.c | 179
-rw-r--r--  fs/ufs/namei.c | 39
-rw-r--r--  fs/ufs/super.c | 49
-rw-r--r--  fs/ufs/ufs.h | 12
-rw-r--r--  fs/ufs/ufs_fs.h | 4
-rw-r--r--  fs/ufs/util.c | 46
-rw-r--r--  fs/ufs/util.h | 61
-rw-r--r--  fs/unicode/README.utf8data | 8
-rw-r--r--  fs/unicode/mkutf8data.c | 4
-rw-r--r--  fs/unicode/utf8-core.c | 28
-rw-r--r--  fs/unicode/utf8-selftest.c | 3
-rw-r--r--  fs/unicode/utf8data.c_shipped | 2
-rw-r--r--  fs/unicode/utf8n.h | 2
-rw-r--r--  fs/utimes.c | 11
-rw-r--r--  fs/xattr.c | 446
-rw-r--r--  fs/xfs/Makefile | 8
-rw-r--r--  fs/xfs/libxfs/xfs_ag.c | 256
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h | 205
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 22
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 119
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 19
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 30
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 137
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 38
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_btree_mem.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_defer.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_defer.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 190
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 199
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 53
-rw-r--r--  fs/xfs/libxfs/xfs_group.c | 225
-rw-r--r--  fs/xfs/libxfs/xfs_group.h | 164
-rw-r--r--  fs/xfs/libxfs/xfs_health.h | 89
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 175
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 31
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 90
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_inode_util.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 8
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_metadir.c | 481
-rw-r--r--  fs/xfs/libxfs/xfs_metadir.h | 47
-rw-r--r--  fs/xfs/libxfs/xfs_metafile.c | 52
-rw-r--r--  fs/xfs/libxfs/xfs_metafile.h | 31
-rw-r--r--  fs/xfs/libxfs/xfs_ondisk.h | 186
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h | 43
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 33
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 17
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 42
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 28
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 388
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.h | 247
-rw-r--r--  fs/xfs/libxfs/xfs_rtgroup.c | 697
-rw-r--r--  fs/xfs/libxfs/xfs_rtgroup.h | 284
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 276
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_types.c | 44
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 16
-rw-r--r--  fs/xfs/scrub/agheader.c | 52
-rw-r--r--  fs/xfs/scrub/agheader_repair.c | 42
-rw-r--r--  fs/xfs/scrub/alloc.c | 2
-rw-r--r--  fs/xfs/scrub/alloc_repair.c | 22
-rw-r--r--  fs/xfs/scrub/bmap.c | 38
-rw-r--r--  fs/xfs/scrub/bmap_repair.c | 11
-rw-r--r--  fs/xfs/scrub/common.c | 149
-rw-r--r--  fs/xfs/scrub/common.h | 40
-rw-r--r--  fs/xfs/scrub/cow_repair.c | 21
-rw-r--r--  fs/xfs/scrub/dir.c | 10
-rw-r--r--  fs/xfs/scrub/dir_repair.c | 20
-rw-r--r--  fs/xfs/scrub/dirtree.c | 32
-rw-r--r--  fs/xfs/scrub/dirtree.h | 12
-rw-r--r--  fs/xfs/scrub/findparent.c | 28
-rw-r--r--  fs/xfs/scrub/fscounters.c | 35
-rw-r--r--  fs/xfs/scrub/fscounters_repair.c | 9
-rw-r--r--  fs/xfs/scrub/health.c | 54
-rw-r--r--  fs/xfs/scrub/ialloc.c | 16
-rw-r--r--  fs/xfs/scrub/ialloc_repair.c | 27
-rw-r--r--  fs/xfs/scrub/inode.c | 35
-rw-r--r--  fs/xfs/scrub/inode_repair.c | 39
-rw-r--r--  fs/xfs/scrub/iscan.c | 4
-rw-r--r--  fs/xfs/scrub/metapath.c | 689
-rw-r--r--  fs/xfs/scrub/newbt.c | 52
-rw-r--r--  fs/xfs/scrub/nlinks.c | 4
-rw-r--r--  fs/xfs/scrub/nlinks_repair.c | 4
-rw-r--r--  fs/xfs/scrub/orphanage.c | 4
-rw-r--r--  fs/xfs/scrub/parent.c | 39
-rw-r--r--  fs/xfs/scrub/parent_repair.c | 37
-rw-r--r--  fs/xfs/scrub/quotacheck.c | 7
-rw-r--r--  fs/xfs/scrub/reap.c | 10
-rw-r--r--  fs/xfs/scrub/refcount.c | 3
-rw-r--r--  fs/xfs/scrub/refcount_repair.c | 7
-rw-r--r--  fs/xfs/scrub/repair.c | 61
-rw-r--r--  fs/xfs/scrub/repair.h | 13
-rw-r--r--  fs/xfs/scrub/rgsuper.c | 84
-rw-r--r--  fs/xfs/scrub/rmap.c | 4
-rw-r--r--  fs/xfs/scrub/rmap_repair.c | 25
-rw-r--r--  fs/xfs/scrub/rtbitmap.c | 54
-rw-r--r--  fs/xfs/scrub/rtsummary.c | 116
-rw-r--r--  fs/xfs/scrub/rtsummary_repair.c | 22
-rw-r--r--  fs/xfs/scrub/scrub.c | 52
-rw-r--r--  fs/xfs/scrub/scrub.h | 17
-rw-r--r--  fs/xfs/scrub/stats.c | 2
-rw-r--r--  fs/xfs/scrub/tempfile.c | 105
-rw-r--r--  fs/xfs/scrub/tempfile.h | 3
-rw-r--r--  fs/xfs/scrub/trace.c | 1
-rw-r--r--  fs/xfs/scrub/trace.h | 247
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 26
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 46
-rw-r--r--  fs/xfs/xfs_buf.c | 7
-rw-r--r--  fs/xfs/xfs_buf.h | 4
-rw-r--r--  fs/xfs/xfs_buf_item_recover.c | 67
-rw-r--r--  fs/xfs/xfs_discard.c | 308
-rw-r--r--  fs/xfs/xfs_dquot.c | 38
-rw-r--r--  fs/xfs/xfs_dquot.h | 18
-rw-r--r--  fs/xfs/xfs_drain.c | 78
-rw-r--r--  fs/xfs/xfs_drain.h | 22
-rw-r--r--  fs/xfs/xfs_exchrange.c | 20
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 214
-rw-r--r--  fs/xfs/xfs_extent_busy.h | 65
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 282
-rw-r--r--  fs/xfs/xfs_file.c | 82
-rw-r--r--  fs/xfs/xfs_filestream.c | 13
-rw-r--r--  fs/xfs/xfs_fsmap.c | 363
-rw-r--r--  fs/xfs/xfs_fsmap.h | 15
-rw-r--r--  fs/xfs/xfs_fsops.c | 14
-rw-r--r--  fs/xfs/xfs_handle.c | 16
-rw-r--r--  fs/xfs/xfs_health.c | 278
-rw-r--r--  fs/xfs/xfs_icache.c | 134
-rw-r--r--  fs/xfs/xfs_inode.c | 33
-rw-r--r--  fs/xfs/xfs_inode.h | 64
-rw-r--r--  fs/xfs/xfs_inode_item.c | 7
-rw-r--r--  fs/xfs/xfs_inode_item_recover.c | 2
-rw-r--r--  fs/xfs/xfs_ioctl.c | 115
-rw-r--r--  fs/xfs/xfs_iomap.c | 71
-rw-r--r--  fs/xfs/xfs_iomap.h | 1
-rw-r--r--  fs/xfs/xfs_iops.c | 47
-rw-r--r--  fs/xfs/xfs_itable.c | 33
-rw-r--r--  fs/xfs/xfs_itable.h | 3
-rw-r--r--  fs/xfs/xfs_iunlink_item.c | 13
-rw-r--r--  fs/xfs/xfs_iwalk.c | 116
-rw-r--r--  fs/xfs/xfs_iwalk.h | 7
-rw-r--r--  fs/xfs/xfs_log_cil.c | 3
-rw-r--r--  fs/xfs/xfs_log_recover.c | 18
-rw-r--r--  fs/xfs/xfs_message.c | 51
-rw-r--r--  fs/xfs/xfs_message.h | 20
-rw-r--r--  fs/xfs/xfs_mount.c | 61
-rw-r--r--  fs/xfs/xfs_mount.h | 113
-rw-r--r--  fs/xfs/xfs_pnfs.c | 3
-rw-r--r--  fs/xfs/xfs_qm.c | 381
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 36
-rw-r--r--  fs/xfs/xfs_quota.h | 19
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 9
-rw-r--r--  fs/xfs/xfs_reflink.c | 7
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 9
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 1025
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 6
-rw-r--r--  fs/xfs/xfs_stats.c | 7
-rw-r--r--  fs/xfs/xfs_trace.c | 5
-rw-r--r--  fs/xfs/xfs_trace.h | 687
-rw-r--r--  fs/xfs/xfs_trans.c | 97
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 25
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 17
-rw-r--r--  fs/xfs/xfs_xattr.c | 3
441 files changed, 17298 insertions, 41522 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index aae170fc2795..64d420e3c475 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,7 +43,6 @@ config FS_MBCACHE
default y if EXT4_FS=y
default m if EXT2_FS_XATTR || EXT4_FS
-source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
source "fs/xfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 61679fd587b7..15df0a923d3a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_NETFS_SUPPORT) += netfs/
-obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
# ext2 driver, which doesn't know about journalling! Explicitly request ext2
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index f0b999a4961b..017c48a80203 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -6,7 +6,8 @@
*/
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_ownmask, "ownmask=%o"},
- {Opt_othmask, "othmask=%o"},
- {Opt_ftsuffix, "ftsuffix=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec adfs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("ownmask", Opt_ownmask),
+ fsparam_u32oct ("othmask", Opt_othmask),
+ fsparam_u32 ("ftsuffix", Opt_ftsuffix),
+ {}
};
-static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
- char *options)
+static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- int option;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(asb->s_uid))
- return -EINVAL;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(asb->s_gid))
- return -EINVAL;
- break;
- case Opt_ownmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_owner_mask = option;
- break;
- case Opt_othmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_other_mask = option;
- break;
- case Opt_ftsuffix:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_ftsuffix = option;
- break;
- default:
- adfs_msg(sb, KERN_ERR,
- "unrecognised mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
+ struct adfs_sb_info *asb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, adfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ asb->s_uid = result.uid;
+ break;
+ case Opt_gid:
+ asb->s_gid = result.gid;
+ break;
+ case Opt_ownmask:
+ asb->s_owner_mask = result.uint_32;
+ break;
+ case Opt_othmask:
+ asb->s_other_mask = result.uint_32;
+ break;
+ case Opt_ftsuffix:
+ asb->s_ftsuffix = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
-static int adfs_remount(struct super_block *sb, int *flags, char *data)
+static int adfs_reconfigure(struct fs_context *fc)
{
- struct adfs_sb_info temp_asb;
- int ret;
+ struct adfs_sb_info *new_asb = fc->s_fs_info;
+ struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb);
- sync_filesystem(sb);
- *flags |= ADFS_SB_FLAGS;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= ADFS_SB_FLAGS;
- temp_asb = *ADFS_SB(sb);
- ret = parse_options(sb, &temp_asb, data);
- if (ret == 0)
- *ADFS_SB(sb) = temp_asb;
+ /* Structure copy newly parsed options */
+ *asb = *new_asb;
- return ret;
+ return 0;
}
static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = {
.write_inode = adfs_write_inode,
.put_super = adfs_put_super,
.statfs = adfs_statfs,
- .remount_fs = adfs_remount,
.show_options = adfs_show_options,
};
@@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh,
return 0;
}
-static int adfs_fill_super(struct super_block *sb, void *data, int silent)
+static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct adfs_discrecord *dr;
struct object_info root_obj;
- struct adfs_sb_info *asb;
+ struct adfs_sb_info *asb = sb->s_fs_info;
struct inode *root;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_flags |= ADFS_SB_FLAGS;
- asb = kzalloc(sizeof(*asb), GFP_KERNEL);
- if (!asb)
- return -ENOMEM;
-
sb->s_fs_info = asb;
sb->s_magic = ADFS_SUPER_MAGIC;
sb->s_time_gran = 10000000;
- /* set default options */
- asb->s_uid = GLOBAL_ROOT_UID;
- asb->s_gid = GLOBAL_ROOT_GID;
- asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
- asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
- asb->s_ftsuffix = 0;
-
- if (parse_options(sb, asb, data))
- goto error;
-
/* Try to probe the filesystem boot block */
ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk);
if (ret == -EILSEQ)
@@ -453,18 +414,61 @@ error:
return ret;
}
-static struct dentry *adfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int adfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, adfs_fill_super);
+}
+
+static void adfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+ struct adfs_sb_info *asb = fc->s_fs_info;
+
+ kfree(asb);
+}
+
+static const struct fs_context_operations adfs_context_ops = {
+ .parse_param = adfs_parse_param,
+ .get_tree = adfs_get_tree,
+ .reconfigure = adfs_reconfigure,
+ .free = adfs_free_fc,
+};
+
+static int adfs_init_fs_context(struct fs_context *fc)
+{
+ struct adfs_sb_info *asb;
+
+ asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL);
+ if (!asb)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct adfs_sb_info *old_asb = ADFS_SB(sb);
+
+ /* structure copy existing options before parsing */
+ *asb = *old_asb;
+ } else {
+ /* set default options */
+ asb->s_uid = GLOBAL_ROOT_UID;
+ asb->s_gid = GLOBAL_ROOT_GID;
+ asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+ asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+ asb->s_ftsuffix = 0;
+ }
+
+ fc->ops = &adfs_context_ops;
+ fc->s_fs_info = asb;
+
+ return 0;
}
static struct file_system_type adfs_fs_type = {
.owner = THIS_MODULE,
.name = "adfs",
- .mount = adfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = adfs_init_fs_context,
+ .parameters = adfs_param_spec,
};
MODULE_ALIAS_FS("adfs");
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3c5821339609..2fa40337776d 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -14,7 +14,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -27,7 +28,6 @@
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_show_options(struct seq_file *m, struct dentry *root);
-static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
affs_commit_super(struct super_block *sb, int wait)
@@ -155,140 +155,114 @@ static const struct super_operations affs_sops = {
.put_super = affs_put_super,
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
- .remount_fs = affs_remount,
.show_options = affs_show_options,
};
enum {
Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
- Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
+ Opt_verbose, Opt_volume, Opt_ignore,
};
-static const match_table_t tokens = {
- {Opt_bs, "bs=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_mufs, "mufs"},
- {Opt_notruncate, "nofilenametruncate"},
- {Opt_prefix, "prefix=%s"},
- {Opt_protect, "protect"},
- {Opt_reserved, "reserved=%u"},
- {Opt_root, "root=%u"},
- {Opt_setgid, "setgid=%u"},
- {Opt_setuid, "setuid=%u"},
- {Opt_verbose, "verbose"},
- {Opt_volume, "volume=%s"},
- {Opt_ignore, "grpquota"},
- {Opt_ignore, "noquota"},
- {Opt_ignore, "quota"},
- {Opt_ignore, "usrquota"},
- {Opt_err, NULL},
+struct affs_context {
+ kuid_t uid; /* uid to override */
+ kgid_t gid; /* gid to override */
+ unsigned int mode; /* mode to override */
+ unsigned int reserved; /* Number of reserved blocks */
+ int root_block; /* FFS root block number */
+ int blocksize; /* Initial device blksize */
+ char *prefix; /* Prefix for volumes and assigns */
+ char volume[32]; /* Vol. prefix for absolute symlinks */
+ unsigned long mount_flags; /* Options */
};
-static int
-parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
- int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
+static const struct fs_parameter_spec affs_param_spec[] = {
+ fsparam_u32 ("bs", Opt_bs),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("mufs", Opt_mufs),
+ fsparam_flag ("nofilenametruncate", Opt_notruncate),
+ fsparam_string ("prefix", Opt_prefix),
+ fsparam_flag ("protect", Opt_protect),
+ fsparam_u32 ("reserved", Opt_reserved),
+ fsparam_u32 ("root", Opt_root),
+ fsparam_gid ("setgid", Opt_setgid),
+ fsparam_uid ("setuid", Opt_setuid),
+ fsparam_flag ("verbose", Opt_verbose),
+ fsparam_string ("volume", Opt_volume),
+ fsparam_flag ("grpquota", Opt_ignore),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("quota", Opt_ignore),
+ fsparam_flag ("usrquota", Opt_ignore),
+ {},
+};
+
+static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- /* Fill in defaults */
-
- *uid = current_uid();
- *gid = current_gid();
- *reserved = 2;
- *root = -1;
- *blocksize = -1;
- volume[0] = ':';
- volume[1] = 0;
- *mount_opts = 0;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, n, option;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bs:
- if (match_int(&args[0], &n))
- return 0;
- if (n != 512 && n != 1024 && n != 2048
- && n != 4096) {
- pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
- return 0;
- }
- *blocksize = n;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return 0;
- *mode = option & 0777;
- affs_set_opt(*mount_opts, SF_SETMODE);
- break;
- case Opt_mufs:
- affs_set_opt(*mount_opts, SF_MUFS);
- break;
- case Opt_notruncate:
- affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
- break;
- case Opt_prefix:
- kfree(*prefix);
- *prefix = match_strdup(&args[0]);
- if (!*prefix)
- return 0;
- affs_set_opt(*mount_opts, SF_PREFIX);
- break;
- case Opt_protect:
- affs_set_opt(*mount_opts, SF_IMMUTABLE);
- break;
- case Opt_reserved:
- if (match_int(&args[0], reserved))
- return 0;
- break;
- case Opt_root:
- if (match_int(&args[0], root))
- return 0;
- break;
- case Opt_setgid:
- if (match_int(&args[0], &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETGID);
- break;
- case Opt_setuid:
- if (match_int(&args[0], &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETUID);
- break;
- case Opt_verbose:
- affs_set_opt(*mount_opts, SF_VERBOSE);
- break;
- case Opt_volume: {
- char *vol = match_strdup(&args[0]);
- if (!vol)
- return 0;
- strscpy(volume, vol, 32);
- kfree(vol);
- break;
- }
- case Opt_ignore:
- /* Silently ignore the quota options */
- break;
- default:
- pr_warn("Unrecognized mount option \"%s\" or missing value\n",
- p);
- return 0;
+ struct affs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int n;
+ int opt;
+
+ opt = fs_parse(fc, affs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_bs:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048
+ && n != 4096) {
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ return -EINVAL;
}
+ ctx->blocksize = n;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 0777;
+ affs_set_opt(ctx->mount_flags, SF_SETMODE);
+ break;
+ case Opt_mufs:
+ affs_set_opt(ctx->mount_flags, SF_MUFS);
+ break;
+ case Opt_notruncate:
+ affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE);
+ break;
+ case Opt_prefix:
+ kfree(ctx->prefix);
+ ctx->prefix = param->string;
+ param->string = NULL;
+ affs_set_opt(ctx->mount_flags, SF_PREFIX);
+ break;
+ case Opt_protect:
+ affs_set_opt(ctx->mount_flags, SF_IMMUTABLE);
+ break;
+ case Opt_reserved:
+ ctx->reserved = result.uint_32;
+ break;
+ case Opt_root:
+ ctx->root_block = result.uint_32;
+ break;
+ case Opt_setgid:
+ ctx->gid = result.gid;
+ affs_set_opt(ctx->mount_flags, SF_SETGID);
+ break;
+ case Opt_setuid:
+ ctx->uid = result.uid;
+ affs_set_opt(ctx->mount_flags, SF_SETUID);
+ break;
+ case Opt_verbose:
+ affs_set_opt(ctx->mount_flags, SF_VERBOSE);
+ break;
+ case Opt_volume:
+ strscpy(ctx->volume, param->string, 32);
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int affs_show_options(struct seq_file *m, struct dentry *root)
@@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root)
* hopefully have the guts to do so. Until then: sorry for the mess.
*/
-static int affs_fill_super(struct super_block *sb, void *data, int silent)
+static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct affs_sb_info *sbi;
+ struct affs_context *ctx = fc->fs_private;
struct buffer_head *root_bh = NULL;
struct buffer_head *boot_bh;
struct inode *root_inode = NULL;
- s32 root_block;
+ int silent = fc->sb_flags & SB_SILENT;
int size, blocksize;
u32 chksum;
int num_bm;
int i, j;
- kuid_t uid;
- kgid_t gid;
- int reserved;
- unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
u8 sig[4];
int ret;
- pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
-
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
sb->s_flags |= SB_NODIRATIME;
@@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
- if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
- &blocksize,&sbi->s_prefix,
- sbi->s_volume, &mount_flags)) {
- pr_err("Error parsing options\n");
- return -EINVAL;
- }
- /* N.B. after this point s_prefix must be released */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
+ sbi->s_reserved = ctx->reserved;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
+ memcpy(sbi->s_volume, ctx->volume, 32);
- sbi->s_flags = mount_flags;
- sbi->s_mode = i;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
- sbi->s_reserved= reserved;
+ /* N.B. after this point s_prefix must be released */
/* Get the size of the device in 512-byte blocks.
* If we later see that the partition uses bigger
@@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
i = bdev_logical_block_size(sb->s_bdev);
j = PAGE_SIZE;
+ blocksize = ctx->blocksize;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
- sbi->s_root_block = root_block;
- if (root_block < 0)
- sbi->s_root_block = (reserved + size - 1) / 2;
+ sbi->s_root_block = ctx->root_block;
+ if (ctx->root_block < 0)
+ sbi->s_root_block = (ctx->reserved + size - 1) / 2;
pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
- blocksize, size, reserved);
+ ctx->blocksize, size, ctx->reserved);
root_bh = affs_bread(sb, sbi->s_root_block + num_bm);
if (!root_bh)
continue;
@@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
got_root:
/* Keep super block in cache */
sbi->s_root_bh = root_bh;
- root_block = sbi->s_root_block;
+ ctx->root_block = sbi->s_root_block;
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
@@ -506,7 +473,7 @@ got_root:
return -EINVAL;
}
- if (affs_test_opt(mount_flags, SF_VERBOSE)) {
+ if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
@@ -528,7 +495,7 @@ got_root:
/* set up enough so that it can read an inode */
- root_inode = affs_iget(sb, root_block);
+ root_inode = affs_iget(sb, ctx->root_block);
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
@@ -548,56 +515,43 @@ got_root:
return 0;
}
-static int
-affs_remount(struct super_block *sb, int *flags, char *data)
+static int affs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_context *ctx = fc->fs_private;
struct affs_sb_info *sbi = AFFS_SB(sb);
- int blocksize;
- kuid_t uid;
- kgid_t gid;
- int mode;
- int reserved;
- int root_block;
- unsigned long mount_flags;
int res = 0;
- char volume[32];
- char *prefix = NULL;
-
- pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
-
- memcpy(volume, sbi->s_volume, 32);
- if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
- &blocksize, &prefix, volume,
- &mount_flags)) {
- kfree(prefix);
- return -EINVAL;
- }
+ fc->sb_flags |= SB_NODIRATIME;
flush_delayed_work(&sbi->sb_work);
- sbi->s_flags = mount_flags;
- sbi->s_mode = mode;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
+ /*
+ * NB: Historically, only mount_flags, mode, uid, gid, prefix,
+ * and volume are accepted during remount.
+ */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
/* protect against readers */
spin_lock(&sbi->symlink_lock);
- if (prefix) {
+ if (ctx->prefix) {
kfree(sbi->s_prefix);
- sbi->s_prefix = prefix;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
}
- memcpy(sbi->s_volume, volume, 32);
+ memcpy(sbi->s_volume, ctx->volume, 32);
spin_unlock(&sbi->symlink_lock);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & SB_RDONLY)
+ if (fc->sb_flags & SB_RDONLY)
affs_free_bitmap(sb);
else
- res = affs_init_bitmap(sb, flags);
+ res = affs_init_bitmap(sb, &fc->sb_flags);
return res;
}
@@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *affs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int affs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+ return get_tree_bdev(fc, affs_fill_super);
}
static void affs_kill_sb(struct super_block *sb)
@@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb)
}
}
+static void affs_free_fc(struct fs_context *fc)
+{
+ struct affs_context *ctx = fc->fs_private;
+
+ kfree(ctx->prefix);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations affs_context_ops = {
+ .parse_param = affs_parse_param,
+ .get_tree = affs_get_tree,
+ .reconfigure = affs_reconfigure,
+ .free = affs_free_fc,
+};
+
+static int affs_init_fs_context(struct fs_context *fc)
+{
+ struct affs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+
+ /*
+ * NB: historically, no options other than volume were
+ * preserved across a remount unless they were explicitly
+ * passed in.
+ */
+ memcpy(ctx->volume, sbi->s_volume, 32);
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->reserved = 2;
+ ctx->root_block = -1;
+ ctx->blocksize = -1;
+ ctx->volume[0] = ':';
+ }
+
+ fc->ops = &affs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type affs_fs_type = {
.owner = THIS_MODULE,
.name = "affs",
- .mount = affs_mount,
.kill_sb = affs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = affs_init_fs_context,
+ .parameters = affs_param_spec,
};
MODULE_ALIAS_FS("affs");
diff --git a/fs/aio.c b/fs/aio.c
index e8920178b50f..50671640b588 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1335,7 +1335,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
if (until == 0 || ret < 0 || ret >= min_nr)
return ret;
- hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
if (until != KTIME_MAX) {
hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns);
hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
@@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- /* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_res.obj == obj) {
ret = kiocb->ki_cancel(&kiocb->rw);
diff --git a/fs/attr.c b/fs/attr.c
index c04d19b58f12..9caf63d20d03 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -272,6 +272,47 @@ out_big:
EXPORT_SYMBOL(inode_newsize_ok);
/**
+ * setattr_copy_mgtime - update timestamps for mgtime inodes
+ * @inode: inode timestamps to be updated
+ * @attr: attrs for the update
+ *
+ * With multigrain timestamps, take more care to prevent races when
+ * updating the ctime. Always update the ctime to the very latest using
+ * the standard mechanism, and use that to populate the atime and mtime
+ * appropriately (unless those are being set to specific values).
+ */
+static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct timespec64 now;
+
+ if (ia_valid & ATTR_CTIME) {
+ /*
+ * In the case of an update for a write delegation, we must respect
+ * the value in ia_ctime and not use the current time.
+ */
+ if (ia_valid & ATTR_DELEG)
+ now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ now = inode_set_ctime_current(inode);
+ } else {
+ /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
+ WARN_ON_ONCE(ia_valid & ATTR_MTIME);
+ now = current_time(inode);
+ }
+
+ if (ia_valid & ATTR_ATIME_SET)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ else if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, now);
+
+ if (ia_valid & ATTR_MTIME_SET)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ else if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, now);
+}
+
+/**
* setattr_copy - copy simple metadata updates into the generic inode
* @idmap: idmap of the mount the inode was found from
* @inode: the inode to be updated
@@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(inode, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(inode, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
if (!in_group_or_capable(idmap, inode,
@@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
mode &= ~S_ISGID;
inode->i_mode = mode;
}
+
+ if (is_mgtime(inode))
+ return setattr_copy_mgtime(inode, attr);
+
+ if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME) {
+ if (ia_valid & ATTR_DELEG)
+ inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
+ }
}
EXPORT_SYMBOL(setattr_copy);
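
The multigrain branch above is only reached on filesystems that opt in to mgtime, in which case is_mgtime() is true when a ->setattr implementation calls setattr_copy(). A hedged sketch of the usual call site ("examplefs" is hypothetical; locking and quota handling omitted):

#include <linux/fs.h>
#include <linux/mnt_idmap.h>

/* Illustrative only: a minimal ->setattr showing where setattr_copy()
 * (and therefore setattr_copy_mgtime() above) is entered. */
static int examplefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			     struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int error;

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	/* Copies uid/gid/mode and routes atime/mtime/ctime updates; on a
	 * multigrain-timestamp filesystem this takes the mgtime path. */
	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}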
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 09a9be945d45..526ddb4d6f76 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -80,7 +80,7 @@ struct backing_aio {
refcount_t ref;
struct kiocb *orig_iocb;
/* used for aio completion */
- void (*end_write)(struct file *, loff_t, ssize_t);
+ void (*end_write)(struct kiocb *iocb, ssize_t);
struct work_struct work;
long res;
};
@@ -108,10 +108,10 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res)
struct kiocb *iocb = &aio->iocb;
struct kiocb *orig_iocb = aio->orig_iocb;
+ orig_iocb->ki_pos = iocb->ki_pos;
if (aio->end_write)
- aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res);
+ aio->end_write(orig_iocb, res);
- orig_iocb->ki_pos = iocb->ki_pos;
backing_aio_put(aio);
}
@@ -176,7 +176,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- old_cred = override_creds(ctx->cred);
+ old_cred = override_creds_light(ctx->cred);
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
@@ -197,10 +197,10 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
backing_aio_cleanup(aio, ret);
}
out:
- revert_creds(old_cred);
+ revert_creds_light(old_cred);
if (ctx->accessed)
- ctx->accessed(ctx->user_file);
+ ctx->accessed(iocb->ki_filp);
return ret;
}
@@ -219,7 +219,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
if (!iov_iter_count(iter))
return 0;
- ret = file_remove_privs(ctx->user_file);
+ ret = file_remove_privs(iocb->ki_filp);
if (ret)
return ret;
@@ -233,13 +233,13 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
*/
flags &= ~IOCB_DIO_CALLER_COMP;
- old_cred = override_creds(ctx->cred);
+ old_cred = override_creds_light(ctx->cred);
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
if (ctx->end_write)
- ctx->end_write(ctx->user_file, iocb->ki_pos, ret);
+ ctx->end_write(iocb, ret);
} else {
struct backing_aio *aio;
@@ -264,13 +264,13 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
backing_aio_cleanup(aio, ret);
}
out:
- revert_creds(old_cred);
+ revert_creds_light(old_cred);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
-ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags,
struct backing_file_ctx *ctx)
@@ -281,20 +281,20 @@ ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
- old_cred = override_creds(ctx->cred);
- ret = vfs_splice_read(in, ppos, pipe, len, flags);
- revert_creds(old_cred);
+ old_cred = override_creds_light(ctx->cred);
+ ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
+ revert_creds_light(old_cred);
if (ctx->accessed)
- ctx->accessed(ctx->user_file);
+ ctx->accessed(iocb->ki_filp);
return ret;
}
EXPORT_SYMBOL_GPL(backing_file_splice_read);
ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out, loff_t *ppos, size_t len,
- unsigned int flags,
+ struct file *out, struct kiocb *iocb,
+ size_t len, unsigned int flags,
struct backing_file_ctx *ctx)
{
const struct cred *old_cred;
@@ -306,18 +306,18 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (!out->f_op->splice_write)
return -EINVAL;
- ret = file_remove_privs(ctx->user_file);
+ ret = file_remove_privs(iocb->ki_filp);
if (ret)
return ret;
- old_cred = override_creds(ctx->cred);
+ old_cred = override_creds_light(ctx->cred);
file_start_write(out);
- ret = out->f_op->splice_write(pipe, out, ppos, len, flags);
+ ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
file_end_write(out);
- revert_creds(old_cred);
+ revert_creds_light(old_cred);
if (ctx->end_write)
- ctx->end_write(ctx->user_file, ppos ? *ppos : 0, ret);
+ ctx->end_write(iocb, ret);
return ret;
}
@@ -329,8 +329,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
const struct cred *old_cred;
int ret;
- if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
- WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
if (!file->f_op->mmap)
@@ -338,12 +337,12 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file);
- old_cred = override_creds(ctx->cred);
+ old_cred = override_creds_light(ctx->cred);
ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
+ revert_creds_light(old_cred);
if (ctx->accessed)
- ctx->accessed(ctx->user_file);
+ ctx->accessed(vma->vm_file);
return ret;
}
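
The backing-file changes above thread the user-facing kiocb through the helpers instead of a bare loff_t pointer, so the file position round-trips via iocb->ki_pos and completion callbacks see the originating kiocb. A sketch of how a stacking filesystem might call the reworked read helper (the names and the private_data convention are hypothetical):

#include <linux/backing-file.h>
#include <linux/cred.h>
#include <linux/fs.h>

/* Illustrative only: a stacked read forwarding to an FMODE_BACKING file. */
static void stackfs_file_accessed(struct file *file)
{
	touch_atime(&file->f_path);	/* propagate atime to the stacked file */
}

static ssize_t stackfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	/* Assume ->open() stashed the opened backing file in private_data. */
	struct file *realfile = iocb->ki_filp->private_data;
	struct backing_file_ctx ctx = {
		.cred     = current_cred(),	/* real code would use mounter creds */
		.accessed = stackfs_file_accessed,
	};

	/* Position travels in iocb->ki_pos; the helper copies it back on
	 * completion, for both sync and async I/O. */
	return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, &ctx);
}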
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f92f108840f5..8f430ff8e445 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -11,12 +11,13 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/nls.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
-#include <linux/parser.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static void befs_put_super(struct super_block *);
-static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
static int befs_show_options(struct seq_file *, struct dentry *);
-static int parse_options(char *, struct befs_mount_options *);
static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_get_parent(struct dentry *child);
+static void befs_free_fc(struct fs_context *fc);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
.free_inode = befs_free_inode, /* deallocate an inode */
.put_super = befs_put_super, /* uninit super */
.statfs = befs_statfs, /* statfs */
- .remount_fs = befs_remount,
.show_options = befs_show_options,
};
@@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child)
}
enum {
- Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
+ Opt_uid, Opt_gid, Opt_charset, Opt_debug,
};
-static const match_table_t befs_tokens = {
- {Opt_uid, "uid=%d"},
- {Opt_gid, "gid=%d"},
- {Opt_charset, "iocharset=%s"},
- {Opt_debug, "debug"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec befs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_string ("iocharset", Opt_charset),
+ fsparam_flag ("debug", Opt_debug),
+ {}
};
static int
-parse_options(char *options, struct befs_mount_options *opts)
+befs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- kuid_t uid;
- kgid_t gid;
-
- /* Initialize options */
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->use_uid = 0;
- opts->use_gid = 0;
- opts->iocharset = NULL;
- opts->debug = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, befs_tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- uid = INVALID_UID;
- if (option >= 0)
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- pr_err("Invalid uid %d, "
- "using default\n", option);
- break;
- }
- opts->uid = uid;
- opts->use_uid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- gid = INVALID_GID;
- if (option >= 0)
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- pr_err("Invalid gid %d, "
- "using default\n", option);
- break;
- }
- opts->gid = gid;
- opts->use_gid = 1;
- break;
- case Opt_charset:
- kfree(opts->iocharset);
- opts->iocharset = match_strdup(&args[0]);
- if (!opts->iocharset) {
- pr_err("allocation failure for "
- "iocharset string\n");
- return 0;
- }
- break;
- case Opt_debug:
- opts->debug = 1;
- break;
- default:
- pr_err("Unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ struct befs_mount_options *opts = fc->fs_private;
+ int token;
+ struct fs_parse_result result;
+
+ /* befs ignores all options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ token = fs_parse(fc, befs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->use_uid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->use_gid = 1;
+ break;
+ case Opt_charset:
+ kfree(opts->iocharset);
+ opts->iocharset = param->string;
+ param->string = NULL;
+ break;
+ case Opt_debug:
+ opts->debug = 1;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int befs_show_options(struct seq_file *m, struct dentry *root)
@@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+/*
+ * Copy the parsed options into the sbi mount_options member
+ */
+static void
+befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts)
+{
+ sbi->mount_opts.uid = opts->uid;
+ sbi->mount_opts.gid = opts->gid;
+ sbi->mount_opts.use_uid = opts->use_uid;
+ sbi->mount_opts.use_gid = opts->use_gid;
+ sbi->mount_opts.debug = opts->debug;
+ sbi->mount_opts.iocharset = opts->iocharset;
+ opts->iocharset = NULL;
+}
+
/* Allocate private field of the superblock, fill it.
*
* Finish filling the public superblock fields
@@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb)
* Load a set of NLS translations if needed.
*/
static int
-befs_fill_super(struct super_block *sb, void *data, int silent)
+befs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct buffer_head *bh;
struct befs_sb_info *befs_sb;
@@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
const unsigned long sb_block = 0;
const off_t x86_sb_off = 512;
int blocksize;
+ struct befs_mount_options *parsed_opts = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL)
@@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
befs_sb = BEFS_SB(sb);
- if (!parse_options((char *) data, &befs_sb->mount_opts)) {
- if (!silent)
- befs_error(sb, "cannot parse mount options");
- goto unacquire_priv_sbp;
- }
+ befs_set_options(befs_sb, parsed_opts);
befs_debug(sb, "---> %s", __func__);
@@ -934,10 +907,10 @@ unacquire_none:
}
static int
-befs_remount(struct super_block *sb, int *flags, char *data)
+befs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if (!(*flags & SB_RDONLY))
+ sync_filesystem(fc->root->d_sb);
+ if (!(fc->sb_flags & SB_RDONLY))
return -EINVAL;
return 0;
}
@@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *
-befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
- void *data)
+static int befs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, befs_fill_super);
+}
+
+static const struct fs_context_operations befs_context_ops = {
+ .parse_param = befs_parse_param,
+ .get_tree = befs_get_tree,
+ .reconfigure = befs_reconfigure,
+ .free = befs_free_fc,
+};
+
+static int befs_init_fs_context(struct fs_context *fc)
+{
+ struct befs_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Initialize options */
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
+
+ fc->fs_private = opts;
+ fc->ops = &befs_context_ops;
+
+ return 0;
+}
+
+static void befs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+ struct befs_mount_options *opts = fc->fs_private;
+
+ kfree(opts->iocharset);
+ kfree(fc->fs_private);
}
static struct file_system_type befs_fs_type = {
.owner = THIS_MODULE,
.name = "befs",
- .mount = befs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = befs_init_fs_context,
+ .parameters = befs_param_spec,
};
MODULE_ALIAS_FS("befs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06dc4a57ba78..3039a6b7aba4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -258,6 +258,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 4fe5bb9f1b1f..31d253bd3961 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -624,6 +624,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4fb925e8c981..fa8515598341 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -78,6 +78,32 @@ config BTRFS_ASSERT
If unsure, say N.
+config BTRFS_EXPERIMENTAL
+ bool "Btrfs experimental features"
+ depends on BTRFS_FS
+ default n
+ help
+ Enable experimental features. These features may not be stable enough
+ for end users. This is meant for btrfs developers or users who wish
+ to test the functionality and report problems.
+
+ Current list:
+
+ - extent map shrinker - performance problems with too frequent shrinks
+
+ - send stream protocol v3 - fs-verity support
+
+ - checksum offload mode - sysfs knob to affect when checksums are
+ calculated (at IO time, or in a thread)
+
+ - raid-stripe-tree - additional mapping of extents to devices to
+ support RAID1* profiles on zoned devices,
+ RAID56 not yet supported
+
+ - extent tree v2 - complex rework of extent tracking
+
+ If unsure, say N.
+
config BTRFS_FS_REF_VERIFY
bool "Btrfs with the ref verify tool compiled in"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 87617f2968bc..3cfc440c636c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
- tests/free-space-tree-tests.o tests/extent-map-tests.o
+ tests/free-space-tree-tests.o tests/extent-map-tests.o \
+ tests/raid-stripe-tree-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f8e1d5b2c512..04f53ca548e1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1442,7 +1442,8 @@ again:
*/
delayed_refs = &ctx->trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
+ head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs,
+ ctx->bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 7e0f9600b80c..1f216d07eff6 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
{
bool auto_csum_mode = true;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 4423d8b716a5..4427c1b835e8 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2797,7 +2797,7 @@ next:
* uncompressed data size, because the compression is only done
* when writeback triggered and we don't know how much space we
* are actually going to need, so we reserve the uncompressed
- * size because the data may be uncompressible in the worst case.
+ * size because the data may be incompressible in the worst case.
*/
if (ret == 0) {
bool used;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e152fde888fc..aa1f55cd81b7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
struct extent_state *other);
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages);
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 90aef2627ca2..0c4d486c3048 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+ folio = filemap_get_folio(mapping, pg_index);
if (!IS_ERR(folio)) {
u64 folio_sz = folio_size(folio);
u64 offset = offset_in_folio(folio, cur);
@@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, folio, cur,
- add_size);
+ btrfs_folio_set_lock(fs_info, folio, cur, add_size);
folio_put(folio);
cur += add_size;
}
@@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(unsigned int level)
+static struct list_head *alloc_heuristic_ws(void)
{
struct heuristic_ws *ws;
@@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
static struct list_head *alloc_workspace(int type, unsigned int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
default:
/*
@@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
+ const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
workspace = get_workspace(type, level);
ret = compression_compress_pages(type, workspace, mapping, start, folios,
out_folios, total_in, total_out);
+ /* The total read-in bytes should be no larger than the input. */
+ ASSERT(*total_in <= orig_len);
put_workspace(type, workspace);
return ret;
}
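The new assertion relies on a calling convention: on entry, *total_out holds the (clamped) length of the input range, which is what orig_len snapshots, so the bytes actually read in can never legitimately exceed it. A caller-side sketch of that convention, assuming the shape of compress_file_range() — not verbatim kernel code:

/*
 * Sketch only: *total_out carries the clamped input length on entry and
 * receives the compressed length on return.
 */
static int compress_range_sketch(struct address_space *mapping, u64 start,
				 u64 end, unsigned int type_level,
				 struct folio **folios, unsigned long *nr_folios)
{
	unsigned long total_in = 0;
	unsigned long total_compressed = end + 1 - start;

	/* One compressed extent covers at most BTRFS_MAX_UNCOMPRESSED bytes. */
	total_compressed = min_t(unsigned long, total_compressed,
				 BTRFS_MAX_UNCOMPRESSED);

	/* On return, total_in <= the clamped input length (the new ASSERT). */
	return btrfs_compress_folios(type_level, mapping, start, folios,
				     nr_folios, &total_in, &total_compressed);
}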
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index b6563b6a333e..954034086d0d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
+struct list_head *lzo_alloc_workspace(void);
void lzo_free_workspace(struct list_head *ws);
int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d15b14..148648ea1c8b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
- struct extent_buffer **eb_ret, int level, int slot,
+ struct extent_buffer **eb_ret, int slot,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
- u64 gen;
- struct extent_buffer *tmp;
- int ret;
+ struct extent_buffer *tmp = NULL;
+ int ret = 0;
int parent_level;
- bool unlock_up;
+ int err;
+ bool read_tmp = false;
+ bool tmp_locked = false;
+ bool path_released = false;
- unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
- gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
check.has_first_key = true;
check.level = parent_level - 1;
- check.transid = gen;
+ check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
check.owner_root = btrfs_root_id(root);
/*
@@ -1540,79 +1540,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp,
- parent_level - 1, &check.first_key, gen)) {
- free_extent_buffer(tmp);
- return -EUCLEAN;
+ if (btrfs_verify_level_key(tmp, &check)) {
+ ret = -EUCLEAN;
+ goto out;
}
*eb_ret = tmp;
- return 0;
+ tmp = NULL;
+ ret = 0;
+ goto out;
}
if (p->nowait) {
- free_extent_buffer(tmp);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- if (unlock_up)
- btrfs_unlock_up_safe(p, level + 1);
-
- /* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_extent_buffer(tmp, &check);
- if (ret) {
- free_extent_buffer(tmp);
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
- return ret;
+ ret = -EAGAIN;
+ path_released = true;
}
- if (unlock_up)
- ret = -EAGAIN;
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ if (ret == 0) {
+ ASSERT(!tmp_locked);
+ *eb_ret = tmp;
+ tmp = NULL;
+ }
goto out;
} else if (p->nowait) {
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- if (unlock_up) {
- btrfs_unlock_up_safe(p, level + 1);
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
ret = -EAGAIN;
- } else {
- ret = 0;
}
if (p->reada != READA_NONE)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, &check);
+ tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ tmp = NULL;
+ goto out;
+ }
+ read_tmp = true;
+
+ if (!p->skip_locking) {
+ ASSERT(ret == -EAGAIN);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
- return PTR_ERR(tmp);
+ path_released = true;
+ }
+
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
}
+
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp))
+ if (!extent_buffer_uptodate(tmp)) {
ret = -EIO;
+ goto out;
+ }
-out:
if (ret == 0) {
+ ASSERT(!tmp_locked);
*eb_ret = tmp;
- } else {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+ tmp = NULL;
+ }
+out:
+ if (tmp) {
+ if (tmp_locked)
+ btrfs_tree_read_unlock(tmp);
+ if (read_tmp && ret && ret != -EAGAIN)
+ free_extent_buffer_stale(tmp);
+ else
+ free_extent_buffer(tmp);
}
+ if (ret && !path_released)
+ btrfs_release_path(p);
return ret;
}
@@ -2197,8 +2233,8 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2324,8 +2360,8 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2334,7 +2370,7 @@ again:
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
- b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
+ b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -4930,8 +4966,7 @@ again:
}
next = c;
- ret = read_block_for_search(root, path, &next, level,
- slot, &key);
+ ret = read_block_for_search(root, path, &next, slot, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
@@ -4974,8 +5009,7 @@ again:
if (!level)
break;
- ret = read_block_for_search(root, path, &next, level,
- 0, &key);
+ ret = read_block_for_search(root, path, &next, 0, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
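After the rework, read_block_for_search() owns all of its cleanup: whenever it has to block it read-locks the child, releases the path and reports -EAGAIN, so callers simply restart the search. The contract, condensed from the call sites changed above:

/*
 * Condensed caller contract (taken from the btrfs_search_slot() hunk
 * above, not new behavior):
 *
 *   0       - *eb_ret points to the up-to-date child, path intact.
 *   -EAGAIN - the path was released; restart from the top, unless the
 *             path is nowait, in which case -EAGAIN is propagated.
 *   other   - hard error; the path was released, bail out.
 */
err = read_block_for_search(root, p, &b, slot, key);
if (err == -EAGAIN && !p->nowait)
	goto again;
if (err) {
	ret = err;
	goto done;
}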
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 317a3712270f..307dedf95c70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -744,16 +744,11 @@ const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
/*
- * We use page status Private2 to indicate there is an ordered extent with
+ * We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
- *
- * Rename the Private2 accessors to Ordered, to improve readability.
*/
-#define PageOrdered(page) PagePrivate2(page)
-#define SetPageOrdered(page) SetPagePrivate2(page)
-#define ClearPageOrdered(page) ClearPagePrivate2(page)
-#define folio_test_ordered(folio) folio_test_private_2(folio)
-#define folio_set_ordered(folio) folio_set_private_2(folio)
-#define folio_clear_ordered(folio) folio_clear_private_2(folio)
+#define folio_test_ordered(folio) folio_test_owner_2(folio)
+#define folio_set_ordered(folio) folio_set_owner_2(folio)
+#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
#endif
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7cfefdfe54ea..f4d9feac0d0e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -64,9 +64,9 @@ struct btrfs_delayed_node {
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
+ int count;
u64 index_cnt;
unsigned long flags;
- int count;
/*
* The size of the next batch of dir index items to insert (if this
* node is from a directory inode). Protected by @mutex.
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index cab94d141f66..0d878dbbabba 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -9,6 +9,7 @@
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
+#include "extent-tree.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
@@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
-/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_root.rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_head *entry;
- struct btrfs_delayed_ref_head *ins;
- u64 bytenr;
- bool leftmost = true;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
- bytenr = ins->bytenr;
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
- href_node);
-
- if (bytenr < entry->bytenr) {
- p = &(*p)->rb_left;
- } else if (bytenr > entry->bytenr) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- return entry;
- }
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
- return NULL;
-}
-
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
@@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
static struct btrfs_delayed_ref_head *find_first_ref_head(
struct btrfs_delayed_ref_root *dr)
{
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
+ unsigned long from = 0;
- n = rb_first_cached(&dr->href_root);
- if (!n)
- return NULL;
+ lockdep_assert_held(&dr->lock);
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
- return entry;
+ return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT);
}
-/*
- * Find a head entry based on bytenr. This returns the delayed ref head if it
- * was able to find one, or NULL if nothing was in that spot. If return_bigger
- * is given, the next bigger entry is returned if no exact match is found.
- */
-static struct btrfs_delayed_ref_head *find_ref_head(
- struct btrfs_delayed_ref_root *dr, u64 bytenr,
- bool return_bigger)
-{
- struct rb_root *root = &dr->href_root.rb_root;
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
-
- n = root->rb_node;
- entry = NULL;
- while (n) {
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return entry;
- }
- if (entry && return_bigger) {
- if (bytenr > entry->bytenr) {
- n = rb_next(&entry->href_node);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_delayed_ref_head,
- href_node);
- }
- return entry;
- }
- return NULL;
-}
-
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
+static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
- return 0;
+ return true;
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_NODE(&head->href_node)) {
+ if (!head->tracked) {
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
- return -EAGAIN;
+ return false;
}
btrfs_put_delayed_ref_head(head);
- return 0;
+ return true;
}
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
@@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
- atomic_dec(&delayed_refs->num_entries);
btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
@@ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
}
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
+ unsigned long start_index;
+ unsigned long found_index;
+ bool found_head = false;
+ bool locked;
- lockdep_assert_held(&delayed_refs->lock);
+ spin_lock(&delayed_refs->lock);
again:
- head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
- true);
- if (!head && delayed_refs->run_delayed_start != 0) {
- delayed_refs->run_delayed_start = 0;
- head = find_first_ref_head(delayed_refs);
+ start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits);
+ xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) {
+ if (!head->processing) {
+ found_head = true;
+ break;
+ }
}
- if (!head)
- return NULL;
-
- while (head->processing) {
- struct rb_node *node;
-
- node = rb_next(&head->href_node);
- if (!node) {
- if (delayed_refs->run_delayed_start == 0)
- return NULL;
- delayed_refs->run_delayed_start = 0;
- goto again;
+ if (!found_head) {
+ if (delayed_refs->run_delayed_start == 0) {
+ spin_unlock(&delayed_refs->lock);
+ return NULL;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
+ delayed_refs->run_delayed_start = 0;
+ goto again;
}
head->processing = true;
@@ -592,18 +514,42 @@ again:
delayed_refs->num_heads_ready--;
delayed_refs->run_delayed_start = head->bytenr +
head->num_bytes;
+
+ locked = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (!locked)
+ return ERR_PTR(-EAGAIN);
+
return head;
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = false;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
lockdep_assert_held(&head->lock);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
+ xa_erase(&delayed_refs->head_refs, index);
+ head->tracked = false;
delayed_refs->num_heads--;
if (!head->processing)
delayed_refs->num_heads_ready--;
@@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
if (!exist) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
- atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
trans->delayed_ref_updates++;
return false;
@@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
- RB_CLEAR_NODE(&head_ref->href_node);
+ head_ref->tracked = false;
head_ref->processing = false;
head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
@@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
qrecord->data_rsv = reserved;
qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = generic_ref->bytenr;
qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
@@ -852,19 +796,33 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
+ const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
bool qrecord_inserted = false;
delayed_refs = &trans->transaction->delayed_refs;
+ lockdep_assert_held(&delayed_refs->lock);
+
+#if BITS_PER_LONG == 32
+ if (head_ref->bytenr >= MAX_LFS_FILESIZE) {
+ if (qrecord)
+ xa_release(&delayed_refs->dirty_extents, index);
+ btrfs_err_rl(fs_info,
+"delayed ref head %llu is beyond 32bit page cache and xarray index limit",
+ head_ref->bytenr);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+#endif
/* Record qgroup extent info if provided */
if (qrecord) {
int ret;
- ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
+ head_ref->bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
- xa_release(&delayed_refs->dirty_extents,
- qrecord->bytenr >> fs_info->sectorsize_bits);
+ xa_release(&delayed_refs->dirty_extents, index);
/* Caller responsible for freeing qrecord on error. */
if (ret < 0)
return ERR_PTR(ret);
@@ -876,8 +834,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
trace_add_delayed_ref_head(fs_info, head_ref, action);
- existing = htree_insert(&delayed_refs->href_root,
- &head_ref->href_node);
+ existing = xa_load(&delayed_refs->head_refs, index);
if (existing) {
update_existing_head_ref(trans, existing, head_ref);
/*
@@ -887,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC);
+ if (xa_is_err(existing)) {
+ /* Memory was preallocated by the caller. */
+ ASSERT(xa_err(existing) != -ENOMEM);
+ return ERR_PTR(xa_err(existing));
+ } else if (WARN_ON(existing)) {
+ /*
+			 * Shouldn't happen, we just did a lookup before under
+ * delayed_refs->lock.
+ */
+ return ERR_PTR(-EEXIST);
+ }
+ head_ref->tracked = true;
/*
* We reserve the amount of bytes needed to delete csums when
* adding the ref head and not when adding individual drop refs
@@ -900,7 +870,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
- atomic_inc(&delayed_refs->num_entries);
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *new_head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
+ const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits);
+ bool qrecord_reserved = false;
bool qrecord_inserted;
int action = generic_ref->action;
bool merged;
@@ -1023,25 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
goto free_node;
}
+ delayed_refs = &trans->transaction->delayed_refs;
+
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
ret = -ENOMEM;
goto free_head_ref;
}
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
- generic_ref->bytenr >> fs_info->sectorsize_bits,
- GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
ret = -ENOMEM;
goto free_record;
}
+ qrecord_reserved = true;
+ }
+
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ if (qrecord_reserved)
+ xa_release(&delayed_refs->dirty_extents, index);
+ goto free_record;
}
init_delayed_ref_common(fs_info, node, generic_ref);
init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
- delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
/*
@@ -1051,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
new_head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
if (IS_ERR(new_head_ref)) {
+ xa_release(&delayed_refs->head_refs, index);
spin_unlock(&delayed_refs->lock);
ret = PTR_ERR(new_head_ref);
goto free_record;
@@ -1074,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
return 0;
free_record:
@@ -1113,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
+ const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits);
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_head *head_ref_ret;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1123,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
.num_bytes = num_bytes,
.tree_ref.level = level,
};
+ int ret;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
@@ -1132,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ return ret;
+ }
+
+ spin_lock(&delayed_refs->lock);
head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
BTRFS_UPDATE_DELAYED_HEAD, NULL);
- spin_unlock(&delayed_refs->lock);
-
if (IS_ERR(head_ref_ret)) {
+ xa_release(&delayed_refs->head_refs, index);
+ spin_unlock(&delayed_refs->lock);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return PTR_ERR(head_ref_ret);
}
+ spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
@@ -1164,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
* head node if found, or NULL if not.
*/
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 bytenr)
{
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
- return find_ref_head(delayed_refs, bytenr, false);
+ return xa_load(&delayed_refs->head_refs, index);
}
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
@@ -1238,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
return found;
}
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ spin_lock(&delayed_refs->lock);
+ while (true) {
+ struct btrfs_delayed_ref_head *head;
+ struct rb_node *n;
+ bool pin_bytes = false;
+
+ head = find_first_ref_head(delayed_refs);
+ if (!head)
+ break;
+
+ if (!btrfs_delayed_ref_lock(delayed_refs, head))
+ continue;
+
+ spin_lock(&head->lock);
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+
+ if (pin_bytes) {
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, head->bytenr);
+ if (WARN_ON_ONCE(bg == NULL)) {
+ /*
+ * Unexpected and there's nothing we can do here
+ * because we are in a transaction abort path,
+ * so any errors can only be ignored or reported
+			 * while attempting to clean up all resources.
+ */
+ btrfs_err(fs_info,
+"block group for delayed ref at %llu was not found while destroying ref head",
+ head->bytenr);
+ } else {
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ bg->space_info,
+ head->num_bytes);
+ bg->reserved -= head->num_bytes;
+ bg->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+ }
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ btrfs_put_delayed_ref_head(head);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ btrfs_qgroup_destroy_extent_records(trans);
+
+ spin_unlock(&delayed_refs->lock);
+}
+
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
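Every consumer of the new xarray derives the index the same way: the extent bytenr right shifted by fs_info->sectorsize_bits, which keeps the index space dense and within unsigned long range on 32-bit hosts. A minimal lookup sketch, condensed from btrfs_find_delayed_ref_head() above:

static struct btrfs_delayed_ref_head *
find_head_sketch(const struct btrfs_fs_info *fs_info,
		 struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	/* With 4K sectors, bytenr 1M maps to index 256. */
	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);

	lockdep_assert_held(&delayed_refs->lock);
	return xa_load(&delayed_refs->head_refs, index);
}

Insertion is split the same way as for dirty_extents: xa_reserve() with GFP_NOFS before taking the spinlock, then xa_store() with GFP_ATOMIC under it, which is why add_delayed_ref_head() can assert that the store never fails with -ENOMEM.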
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 352921e76c74..611fb3388f82 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node {
/*
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
* ref_head->ref_add_list, then we do not need to iterate the
- * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+ * refs rbtree in the corresponding delayed ref head
+ * (struct btrfs_delayed_ref_head::ref_tree).
*/
struct list_head add_list;
@@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head {
u64 bytenr;
u64 num_bytes;
/*
- * For insertion into struct btrfs_delayed_ref_root::href_root.
- * Keep it in the same cache line as 'bytenr' for more efficient
- * searches in the rbtree.
- */
- struct rb_node href_node;
- /*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
*/
@@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head {
bool is_data;
bool is_system;
bool processing;
+ /*
+ * Indicate if it's currently in the data structure that tracks head
+ * refs (struct btrfs_delayed_ref_root::head_refs).
+ */
+ bool tracked;
};
enum btrfs_delayed_ref_flags {
@@ -199,38 +199,52 @@ enum btrfs_delayed_ref_flags {
};
struct btrfs_delayed_ref_root {
- /* head ref rbtree */
- struct rb_root_cached href_root;
-
/*
- * Track dirty extent records.
+ * Track head references.
* The keys correspond to the logical address of the extent ("bytenr")
* right shifted by fs_info->sectorsize_bits. This is both to get a more
* dense index space (optimizes xarray structure) and because indexes in
* xarrays are of "unsigned long" type, meaning they are 32 bits wide on
* 32 bits platforms, limiting the extent range to 4G which is too low
* and makes it unusable (truncated index values) on 32 bits platforms.
+ * Protected by the spinlock 'lock' defined below.
*/
- struct xarray dirty_extents;
+ struct xarray head_refs;
- /* this spin lock protects the rbtree and the entries inside */
- spinlock_t lock;
+ /*
+ * Track dirty extent records.
+ * The keys correspond to the logical address of the extent ("bytenr")
+	 * right shifted by fs_info->sectorsize_bits, for the same reasons as above.
+ */
+ struct xarray dirty_extents;
- /* how many delayed ref updates we've queued, used by the
- * throttling code
+ /*
+ * Protects the xarray head_refs, its entries and the following fields:
+ * num_heads, num_heads_ready, pending_csums and run_delayed_start.
*/
- atomic_t num_entries;
+ spinlock_t lock;
- /* total number of head nodes in tree */
+ /* Total number of head refs, protected by the spinlock 'lock'. */
unsigned long num_heads;
- /* total number of head nodes ready for processing */
+ /*
+ * Total number of head refs ready for processing, protected by the
+ * spinlock 'lock'.
+ */
unsigned long num_heads_ready;
+ /*
+ * Track space reserved for deleting csums of data extents.
+ * Protected by the spinlock 'lock'.
+ */
u64 pending_csums;
unsigned long flags;
+ /*
+ * Track from which bytenr to start searching ref heads.
+ * Protected by the spinlock 'lock'.
+ */
u64 run_delayed_start;
/*
@@ -372,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs);
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
@@ -399,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
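With btrfs_delayed_ref_lock() now private, btrfs_select_ref_head() hands back a head that is already marked as processing and mutex-locked, returning NULL when there is no work and ERR_PTR(-EAGAIN) when the chosen head was freed in the race window; btrfs_unselect_ref_head() undoes the selection on error paths. A condensed consumer loop, mirroring the __btrfs_run_delayed_refs() hunk further below:

/* Condensed sketch; the real loop is in __btrfs_run_delayed_refs(). */
while (!done) {
	struct btrfs_delayed_ref_head *head;

	head = btrfs_select_ref_head(fs_info, delayed_refs);
	if (IS_ERR_OR_NULL(head)) {
		if (PTR_ERR(head) == -EAGAIN)
			continue;	/* lost a race, pick another head */
		break;			/* NULL: nothing ready to run */
	}

	/*
	 * Process the refs for this head; on failure the helpers call
	 * btrfs_unselect_ref_head() to mark the head ready again and
	 * drop its mutex.
	 */
	done = (btrfs_run_delayed_refs_for_head(trans, head, &bytes) != 0);
}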
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 83d5cdd77f29..ac8e97ed13f7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -45,7 +45,7 @@
*
* - Copy existing extents
*
- * This happens by re-using scrub facility, as scrub also iterates through
+ * This happens by reusing the scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
@@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
down_write(&dev_replace->rwsem);
+ dev_replace->replace_task = current;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -994,6 +995,7 @@ error:
list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->rw_devices++;
+ dev_replace->replace_task = NULL;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1e8cd7c9472e..1ea5d8fcfbf7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
const char *name,
int name_len)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
struct extent_buffer *leaf;
@@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (ret == -EEXIST) {
struct btrfs_dir_item *di;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
btrfs_extend_item(trans, path, data_size);
@@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
if (ret > 0)
return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return btrfs_match_dir_item_name(path, name, name_len);
}
/*
@@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
- di = btrfs_match_dir_item_name(root->fs_info, path,
- name->name, name->len);
+ di = btrfs_match_dir_item_name(path, name->name, name->len);
if (di)
return di;
}
@@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 5f6dfafc91f1..28d69970bc70 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name,
int name_len);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index bd38df5647e3..a7c3e221378d 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -834,7 +834,7 @@ relock:
return ret;
}
- ret = btrfs_write_check(iocb, from, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b11bfe68dd65..814320948645 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -917,8 +917,7 @@ fail:
return ERR_PTR(ret);
}
-static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
@@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log_root;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item;
int ret;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_scrub(fs_info);
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
+ btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
- spin_lock_init(&fs_info->extent_map_shrinker_lock);
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
return 0;
}
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options)
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
{
u32 sectorsize;
u32 nodesize;
@@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"transaction %llu (with %llu dirty metadata bytes) is not committed",
trans->transid, dirty_bytes);
- btrfs_cleanup_one_transaction(trans, fs_info);
+ btrfs_cleanup_one_transaction(trans);
if (trans == fs_info->running_transaction)
fs_info->running_transaction = NULL;
@@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->em_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
-static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
- struct btrfs_delayed_ref_node *ref;
-
- spin_lock(&delayed_refs->lock);
- while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
- struct btrfs_delayed_ref_head *head;
- struct rb_node *n;
- bool pin_bytes = false;
-
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- if (btrfs_delayed_ref_lock(delayed_refs, head))
- continue;
-
- spin_lock(&head->lock);
- while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
- ref = rb_entry(n, struct btrfs_delayed_ref_node,
- ref_node);
- rb_erase_cached(&ref->ref_node, &head->ref_tree);
- RB_CLEAR_NODE(&ref->ref_node);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
- atomic_dec(&delayed_refs->num_entries);
- btrfs_put_delayed_ref(ref);
- btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
- }
- if (head->must_insert_reserved)
- pin_bytes = true;
- btrfs_free_delayed_extent_op(head->extent_op);
- btrfs_delete_ref_head(delayed_refs, head);
- spin_unlock(&head->lock);
- spin_unlock(&delayed_refs->lock);
- mutex_unlock(&head->mutex);
-
- if (pin_bytes) {
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, head->bytenr);
- BUG_ON(!cache);
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += head->num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info,
- cache->space_info, head->num_bytes);
- cache->reserved -= head->num_bytes;
- cache->space_info->bytes_reserved -= head->num_bytes;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-
- btrfs_error_unpin_extent_range(fs_info, head->bytenr,
- head->bytenr + head->num_bytes - 1);
- }
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
- btrfs_put_delayed_ref_head(head);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- }
- btrfs_qgroup_destroy_extent_records(trans);
-
- spin_unlock(&delayed_refs->lock);
-}
-
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
@@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->fs_roots_radix_lock);
}
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
{
+ struct btrfs_fs_info *fs_info = cur_trans->fs_info;
struct btrfs_device *dev, *tmp;
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
@@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
list_del_init(&dev->post_commit_list);
}
- btrfs_destroy_delayed_refs(cur_trans, fs_info);
+ btrfs_destroy_delayed_refs(cur_trans);
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
@@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
} else {
spin_unlock(&fs_info->trans_lock);
}
- btrfs_cleanup_one_transaction(t, fs_info);
+ btrfs_cleanup_one_transaction(t);
spin_lock(&fs_info->trans_lock);
if (t == fs_info->running_transaction)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 99af64d3f277..a7051e2570c1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
@@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d9f511babd89..412e318e4a22 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -182,7 +182,7 @@ search_again:
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
@@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
path->search_for_extension = 1;
- path->keep_locks = 1;
} else
extra_size = -1;
@@ -946,6 +945,25 @@ again:
ret = -EAGAIN;
goto out;
}
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key tmp_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
+ if (tmp_key.objectid == bytenr &&
+ tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ goto out_no_entry;
+ }
+
+ if (!path->keep_locks) {
+ btrfs_release_path(path);
+ path->keep_locks = 1;
+ goto again;
+ }
+
/*
* To add new inline back ref, we have to make sure
* there is no corresponding back ref item.
@@ -959,13 +977,15 @@ again:
goto out;
}
}
+out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
- if (insert) {
+ if (path->keep_locks) {
path->keep_locks = 0;
- path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
+ if (insert)
+ path->search_for_extension = 0;
return ret;
}
@@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return ref;
}
-static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- spin_lock(&delayed_refs->lock);
- head->processing = false;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(head);
-}
-
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
- unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_unselect_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
return ret;
} else if (ret) {
@@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
return 1;
}
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return ret;
}
-static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_delayed_ref_root *delayed_refs =
- &trans->transaction->delayed_refs;
- struct btrfs_delayed_ref_head *head = NULL;
- int ret;
-
- spin_lock(&delayed_refs->lock);
- head = btrfs_select_ref_head(delayed_refs);
- if (!head) {
- spin_unlock(&delayed_refs->lock);
- return head;
- }
-
- /*
- * Grab the lock that says we are going to process all the refs for
- * this head
- */
- ret = btrfs_delayed_ref_lock(delayed_refs, head);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * We may have dropped the spin lock to get the head mutex lock, and
- * that might have given someone else time to free the head. If that's
- * true, it has been removed from our list and we can move on.
- */
- if (ret == -EAGAIN)
- head = ERR_PTR(-EAGAIN);
-
- return head;
-}
-
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released)
@@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
return -EAGAIN;
}
@@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
default:
WARN_ON(1);
}
- atomic_dec(&delayed_refs->num_entries);
/*
* Record the must_insert_reserved flag before we drop the
@@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
return ret;
}
@@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
do {
if (!locked_ref) {
- locked_ref = btrfs_obtain_ref_head(trans);
+ locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
if (IS_ERR_OR_NULL(locked_ref)) {
if (PTR_ERR(locked_ref) == -EAGAIN) {
continue;
@@ -2220,7 +2196,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
+ if (xa_empty(&delayed_refs->head_refs)) {
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
delayed_refs = &cur_trans->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans);
@@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
break;
}
- /* Quick path didn't find the EXTEMT/METADATA_ITEM */
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
@@ -3377,13 +3353,14 @@ out:
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
u64 bytenr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (!head)
goto out_delayed_unlock;
@@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex))
goto out;
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
head->processing = false;
spin_unlock(&head->lock);
@@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
- btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -5270,7 +5247,7 @@ struct walk_control {
* corrupted file systems must have been caught before calling this function.
*/
static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
- struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+ struct extent_buffer *eb, u64 flags, int slot)
{
struct btrfs_key key;
u64 generation;
@@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* If we don't need to visit this node don't reada. */
- if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ if (!visit_node_for_delete(root, wc, eb, flags, slot))
continue;
reada:
btrfs_readahead_node_child(eb, slot);
@@ -5518,7 +5495,7 @@ again:
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head)
goto out;
if (!mutex_trylock(&head->mutex)) {
@@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/* If we don't have to walk into this node skip it. */
if (!visit_node_for_delete(root, wc, path->nodes[level],
- wc->refs[level - 1], wc->flags[level - 1],
- path->slots[level]))
+ wc->flags[level - 1], path->slots[level]))
goto skip;
/*
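lookup_inline_extent_backref() no longer sets path->keep_locks up front for insertions; it only retries with it in the rare case where the target slot is the last item of its leaf, so a colliding backref item could live in the next leaf. The retry shape, condensed from the hunk above:

/* Condensed from the lookup_inline_extent_backref() hunk; not verbatim. */
if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
	struct btrfs_key tmp_key;

	/* The next item is in this leaf: inspect it without keep_locks. */
	btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
	if (tmp_key.objectid == bytenr &&
	    tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY)
		return -EAGAIN;		/* a colliding backref item exists */
	/* Otherwise there is no existing backref; proceed to insert. */
} else if (!path->keep_locks) {
	/*
	 * Rare case: the slot is last in its leaf, so a colliding item
	 * could sit in the next leaf. Redo the search keeping upper
	 * level locks so the next leaf can be examined safely.
	 */
	btrfs_release_path(path);
	path->keep_locks = 1;
	goto again;
}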
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 872cca54cc6c..b923d0cec61c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
- btrfs_folio_end_writer_lock(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
static void __process_folios_contig(struct address_space *mapping,
@@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
range_start = max_t(u64, folio_pos(folio), start);
range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
end + 1) - range_start;
- btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len);
+ btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
}
@@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
if (!btrfs_is_subpage(fs_info, folio->mapping))
folio_unlock(folio);
else
- btrfs_subpage_end_reader(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
/*
@@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
return;
ASSERT(folio_test_private(folio));
- btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
}
/*
@@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
len);
size -= len;
@@ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
+static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
+ u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ unsigned int start_bit;
+ unsigned int nbits;
+
+ ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ nbits = len >> fs_info->sectorsize_bits;
+ ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
+ bitmap_set(delalloc_bitmap, start_bit, nbits);
+}
+
+static bool find_next_delalloc_bitmap(struct folio *folio,
+ unsigned long *delalloc_bitmap, u64 start,
+ u64 *found_start, u32 *found_len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ const unsigned int bitmap_size = fs_info->sectors_per_page;
+ unsigned int start_bit;
+ unsigned int first_zero;
+ unsigned int first_set;
+
+ ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
+ if (first_set >= bitmap_size)
+ return false;
+
+ *found_start = folio_start + (first_set << fs_info->sectorsize_bits);
+ first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
+ *found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
+ return true;
+}
+
/*
* helper for extent_writepage(), doing all of the delayed allocation setup.
*
@@ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1131,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
@@ -1140,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
bio_ctrl->submit_bitmap = 1;
}
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ u64 start = page_start + (bit << fs_info->sectorsize_bits);
+
+ btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
+ }
+
/* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
@@ -1148,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
- btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start,
- min(delalloc_end, page_end) + 1 -
- delalloc_start);
+ set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+ min(delalloc_end, page_end) + 1 - delalloc_start);
last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
@@ -1175,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
- found = btrfs_subpage_find_writer_locked(fs_info, folio,
+ found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
}
if (!found)
@@ -1314,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
* a folio for a range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
- btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
/*
 * The above call should set the writeback flag for the whole folio,
 * even if just for a single subpage sector.
@@ -1391,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
goto out;
submitted_io = true;
}
-
- btrfs_folio_assert_not_dirty(fs_info, folio, start, len);
out:
/*
 * If we didn't submit any sector (>= i_size), the folio dirty flag gets
@@ -1476,7 +1520,7 @@ done:
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
*/
- btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
+ btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0);
return ret;
}
@@ -1708,7 +1752,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
ret = bio_add_folio(&bbio->bio, folio, eb->len,
eb->start - folio_pos(folio));
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ wbc_account_cgroup_owner(wbc, folio, eb->len);
folio_unlock(folio);
} else {
int num_folios = num_extent_folios(eb);
@@ -1722,8 +1766,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_start_writeback(folio);
ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
- eb->folio_size);
+ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2116,7 +2159,27 @@ retry:
continue;
}
- if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * For subpage case, compression can lead to mixed
+ * writeback and dirty flags, e.g.:
+ * 0 32K 64K 96K 128K
+ * | |//////||/////| |//|
+ *
+ * In above case, [32K, 96K) is asynchronously submitted
+ * for compression, and [124K, 128K) needs to be written back.
+ *
+ * If we didn't wait for writeback on page 64K, [124K, 128K)
+ * won't be submitted as the page still has the writeback flag
+ * and will be skipped in the next check.
+ *
+ * This mixed writeback and dirty case is only possible for
+ * the subpage case.
+ *
+ * TODO: Remove this check after migrating compression to
+ * regular submission.
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE ||
+ btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2201,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
u32 cur_len = cur_end + 1 - cur;
struct folio *folio;
- folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0);
+ folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
/*
* This shouldn't happen, the pages are pinned and locked, this
@@ -2234,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
cur, cur_len, !ret);
mapping_set_error(mapping, ret);
}
- btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len);
+ btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2318,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* to drop the page.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
- struct folio *folio, gfp_t mask)
+ struct folio *folio)
{
u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
@@ -2429,7 +2492,7 @@ next:
cond_resched();
}
}
- return try_release_extent_state(io_tree, folio, mask);
+ return try_release_extent_state(io_tree, folio);
}
static void __free_extent_buffer(struct extent_buffer *eb)
@@ -2443,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
+static bool folio_range_has_eb(struct folio *folio)
{
struct btrfs_subpage *subpage;
@@ -2453,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
- /*
- * Even there is no eb refs here, we may still have
- * end_folio_read() call relying on page::private.
- */
- if (atomic_read(&subpage->readers))
- return true;
}
return false;
}
@@ -2517,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!folio_range_has_eb(fs_info, folio))
+ if (!folio_range_has_eb(folio))
btrfs_detach_subpage(fs_info, folio);
spin_unlock(&folio->mapping->i_private_lock);
@@ -3122,7 +3179,7 @@ out:
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utlizing page->mapping.
+ * so it can be cleaned up without utilizing page->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -4222,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
struct btrfs_tree_parent_check check = {
- .has_first_key = 0,
.level = level,
.transid = gen
};
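
Dropping the explicit .has_first_key = 0 is safe because of a standard C guarantee, not anything btrfs-specific: a designated initializer zero-initializes every member that is not named. A sketch:

	struct btrfs_tree_parent_check check = {
		.level = level,
		.transid = gen
	};
	/* All other members (has_first_key, owner_root, first_key, ...)
	 * are zero-initialized exactly as if spelled out explicitly. */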
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1d93e1202c33..67ce85ff0ae2 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ rb_erase(&em->rb_node, &inode->extent_tree.root);
+ RB_CLEAR_NODE(&em->rb_node);
+
if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->flags |= EXTENT_FLAG_MERGED;
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
merge_ondisk_extents(em, merge, em);
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
- rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- RB_CLEAR_NODE(&em->rb_node);
- dec_evictable_extent_maps(inode);
+ remove_em(inode, em);
}
static void replace_extent_mapping(struct btrfs_inode *inode,
@@ -1122,13 +1118,12 @@ out_free_pre:
struct btrfs_em_shrink_ctx {
long nr_to_scan;
long scanned;
- u64 last_ino;
- u64 last_root;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
@@ -1201,7 +1196,8 @@ next:
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
- if (need_resched() || rwlock_needbreak(&tree->lock))
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
break;
node = next;
}
@@ -1213,19 +1209,21 @@ next:
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = ctx->last_ino + 1;
+ u64 min_ino = fs_info->em_shrinker_last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- ctx->last_ino = btrfs_ino(inode);
+ fs_info->em_shrinker_last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
- if (ctx->scanned >= ctx->nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(inode->root->fs_info))
break;
cond_resched();
@@ -1241,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- ctx->last_root = btrfs_root_id(root);
+ fs_info->em_shrinker_last_root = btrfs_root_id(root);
} else {
/*
 * No more inodes in this root, set em_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- ctx->last_ino = 0;
- ctx->last_root = btrfs_root_id(root) + 1;
+ fs_info->em_shrinker_last_ino = 0;
+ fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
}
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- ctx.scanned = 0;
- ctx.nr_to_scan = nr_to_scan;
+ fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);
- /*
- * In case we have multiple tasks running this shrinker, make the next
- * one start from the next inode in case it starts before we finish.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
- fs_info->extent_map_shrinker_last_ino++;
- ctx.last_root = fs_info->extent_map_shrinker_last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
+ ctx.scanned = 0;
+ ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);
- start_root_id = ctx.last_root;
- next_root_id = ctx.last_root;
+ start_root_id = fs_info->em_shrinker_last_root;
+ next_root_id = fs_info->em_shrinker_last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
}
- while (ctx.scanned < ctx.nr_to_scan) {
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@@ -1300,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- ctx.last_root = 0;
- ctx.last_ino = 0;
+ fs_info->em_shrinker_last_root = 0;
+ fs_info->em_shrinker_last_ino = 0;
cycled = true;
continue;
}
@@ -1320,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
btrfs_put_root(root);
}
- /*
- * In case of multiple tasks running this extent map shrinking code this
- * isn't perfect but it's simple and silences things like KCSAN. It's
- * not possible to know which task made more progress because we can
- * cycle back to the first root and first inode if it's not the first
- * time the shrinker ran, see the above logic. Also a task that started
- * later may finish ealier than another task and made less progress. So
- * make this simple and update to the progress of the last task that
- * finished, with the occasional possiblity of having two consecutive
- * runs of the shrinker process the same inodes.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
- fs_info->extent_map_shrinker_last_root = ctx.last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
-
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
}
- return nr_dropped;
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker does enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
}
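
The cmpxchg gate in btrfs_free_extent_maps() is a common "at most one queued work" pattern: the first caller queues the worker, later callers are no-ops until the worker zeroes the counter again. A distilled sketch of the idea with hypothetical names, assuming only the core workqueue and atomic APIs:

	static atomic64_t pending;		/* 0 == no work queued */
	static struct work_struct scan_work;

	static void request_scan(long nr_to_scan)
	{
		/* Only the first requester queues the work. */
		if (atomic64_cmpxchg(&pending, 0, nr_to_scan) != 0)
			return;
		queue_work(system_unbound_wq, &scan_work);
	}

	static void scan_worker(struct work_struct *work)
	{
		long nr = atomic64_read(&pending);

		/* ... scan up to nr items ... */
		atomic64_set(&pending, 0);	/* reopen the gate */
	}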
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5154a8f1d26c..cd123b266b64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index df7f09f3b02e..b80c07ad8c5e 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* we have in the cache is the last delalloc range we
* found while the file extent item we found can be
* either for a whole delalloc range we previously
- * emmitted or only a part of that range.
+ * emitted or only a part of that range.
*
* We have two cases here:
*
@@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* cached extent's end. In this case just ignore the
* current file extent item because we don't want to
* overlap with previous ranges that may have been
- * emmitted already;
+ * emitted already;
*
* 2) The file extent item starts behind the currently
* cached extent but its end offset goes beyond the
* end offset of the cached extent. We don't want to
* overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
+ * emitted already, so we emit the currently cached
* extent and then partially store the current file
* extent item's range in the cache, for the subrange
 * going from the cached extent's end to the end of the
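
A worked instance of case 2 above, as a sketch with hypothetical names: the cache holds [0, 64K) and the next file extent item covers [32K, 128K); the cached range is emitted unchanged and only the non-overlapping tail is cached.

	u64 cache_end = cache->offset + cache->len;	/* 64K */

	if (fe_start < cache_end && fe_start + fe_len > cache_end) {
		emit_cached_range(cache);		/* emit [0, 64K) */
		cache->offset = cache_end;		/* cache [64K, 128K) */
		cache->len = fe_start + fe_len - cache_end;
	}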
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4fb521d91b06..588c353d2969 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -37,33 +37,30 @@
#include "file.h"
#include "super.h"
-/* simple helper to fault in pages and copy. This should go away
- * and be replaced with calls into generic code.
+/*
+ * Helper to fault in a page and copy. This should go away and be replaced with
+ * calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
- struct page **prepared_pages,
- struct iov_iter *i)
+ struct folio *folio, struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
- int pg = 0;
int offset = offset_in_page(pos);
while (write_bytes > 0) {
- size_t count = min_t(size_t,
- PAGE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[pg];
+ size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
/*
* Copy data from userspace to the current page
*/
- copied = copy_page_from_iter_atomic(page, offset, count, i);
+ copied = copy_folio_from_iter_atomic(folio, offset, count, i);
/* Flush processor's dcache for this page */
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
/*
* if we get a partial write, we can end up with
- * partially up to date pages. These add
+ * a partially up to date folio. This adds
 * a lot of complexity, so make sure it doesn't
 * happen by forcing this copy to be retried.
*
@@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* back to page at a time copies after we return 0.
*/
if (unlikely(copied < count)) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied);
copied = 0;
}
@@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
write_bytes -= copied;
total_copied += copied;
offset += copied;
- if (offset == PAGE_SIZE) {
- pg++;
- offset = 0;
- }
}
return total_copied;
}
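
The partial-copy handling above relies on the usual prefault-then-atomic-copy contract: the caller prefaults the user pages, the atomic copy can still come up short if they are reclaimed in between, and a short copy into a non-uptodate folio is reverted so the whole step can be retried. A generic sketch of the surrounding retry loop, not the btrfs code itself:

	while (bytes > 0) {
		if (fault_in_iov_iter_readable(i, bytes))
			return -EFAULT;		/* user buffer is truly bad */
		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
		if (copied == 0)
			continue;		/* faulted out again, retry */
		offset += copied;
		bytes -= copied;
	}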
/*
- * unlocks pages after btrfs_file_write is done with them
+ * Unlock folio after btrfs_file_write() is done with it.
*/
-static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
- struct page **pages, size_t num_pages,
+static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
u64 pos, u64 copied)
{
- size_t i;
u64 block_start = round_down(pos, fs_info->sectorsize);
u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
ASSERT(block_len <= U32_MAX);
- for (i = 0; i < num_pages; i++) {
- /* page checked is some magic around finding pages that
- * have been modified without going through btrfs_set_page_dirty
- * clear it here. There should be no need to mark the pages
- * accessed as prepare_pages should have marked them accessed
- * in prepare_pages via find_or_create_page()
- */
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
- block_start, block_len);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ /*
+ * Folio checked is some magic around finding folios that have been
+ * modified without going through btrfs_dirty_folio(). Clear it here.
+ * There should be no need to mark the folio accessed, as
+ * prepare_one_folio() should have marked it accessed via
+ * find_or_create_page().
+ */
+ btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
* After btrfs_copy_from_user(), update the following things for delalloc:
- * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
* - Update inode size for past EOF write
*/
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve)
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
- int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
+ ASSERT(folio_pos(folio) <= pos &&
+ folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
if (ret)
return ret;
- for (i = 0; i < num_pages; i++) {
- struct page *p = pages[i];
-
- btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
- start_pos, num_bytes);
- }
+ btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
/*
* we've only changed i_size in ram, and we haven't updated
@@ -851,53 +833,47 @@ out:
}
/*
- * on error we return an unlocked page and the error value
- * on success we return a locked page and 0
+ * On error return an unlocked folio and the error value
+ * On success return a locked folio and 0
*/
-static int prepare_uptodate_page(struct inode *inode,
- struct page *page, u64 pos,
- bool force_uptodate)
+static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
+ u64 len, bool force_uptodate)
{
- struct folio *folio = page_folio(page);
+ u64 clamp_start = max_t(u64, pos, folio_pos(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
int ret = 0;
- if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
- !PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, folio);
- if (ret)
- return ret;
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- return -EIO;
- }
-
- /*
- * Since btrfs_read_folio() will unlock the folio before it
- * returns, there is a window where btrfs_release_folio() can be
- * called to release the page. Here we check both inode
- * mapping and PagePrivate() to make sure the page was not
- * released.
- *
- * The private flag check is essential for subpage as we need
- * to store extra bitmap using folio private.
- */
- if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
- unlock_page(page);
- return -EAGAIN;
- }
- }
- return 0;
-}
+ if (folio_test_uptodate(folio))
+ return 0;
-static fgf_t get_prepare_fgp_flags(bool nowait)
-{
- fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+ if (!force_uptodate &&
+ IS_ALIGNED(clamp_start, PAGE_SIZE) &&
+ IS_ALIGNED(clamp_end, PAGE_SIZE))
+ return 0;
- if (nowait)
- fgp_flags |= FGP_NOWAIT;
+ ret = btrfs_read_folio(NULL, folio);
+ if (ret)
+ return ret;
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
- return fgp_flags;
+ /*
+ * Since btrfs_read_folio() will unlock the folio before it returns,
+ * there is a window where btrfs_release_folio() can be called to
+ * release the page. Here we check both inode mapping and page
+ * private to make sure the page was not released.
+ *
+ * The private flag check is essential for subpage as we need to store
+ * extra bitmap using folio private.
+ */
+ if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
+ folio_unlock(folio);
+ return -EAGAIN;
+ }
+ return 0;
}
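
The clamp above means the read is skipped exactly when the write covers whole pages of the folio. A sketch with concrete, assumed numbers (4K page at file offset 0):

	u64 pos = 100, len = 512;	/* write [100, 612) */
	u64 clamp_start = max_t(u64, pos, folio_pos(folio));		/* 100 */
	u64 clamp_end = min_t(u64, pos + len,
			      folio_pos(folio) + folio_size(folio));	/* 612 */
	/* 612 is not page aligned -> the folio must be read first so the
	 * bytes outside [100, 612) stay correct. A write of exactly
	 * [0, 4096) would skip the read, as the folio is fully overwritten. */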
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
@@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
}
/*
- * this just gets pages into the page cache and locks them down.
+ * Get folio into the page cache and lock it.
*/
-static noinline int prepare_pages(struct inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate,
- bool nowait)
+static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
+ loff_t pos, size_t write_bytes,
+ bool force_uptodate, bool nowait)
{
- int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ struct folio *folio;
int ret = 0;
- int faili;
- for (i = 0; i < num_pages; i++) {
again:
- pages[i] = pagecache_get_page(inode->i_mapping, index + i,
- fgp_flags, mask | __GFP_WRITE);
- if (!pages[i]) {
- faili = i - 1;
- if (nowait)
- ret = -EAGAIN;
- else
- ret = -ENOMEM;
- goto fail;
- }
-
- ret = set_page_extent_mapped(pages[i]);
- if (ret < 0) {
- faili = i;
- goto fail;
- }
-
- if (i == 0)
- ret = prepare_uptodate_page(inode, pages[i], pos,
- force_uptodate);
- if (!ret && i == num_pages - 1)
- ret = prepare_uptodate_page(inode, pages[i],
- pos + write_bytes, false);
- if (ret) {
- put_page(pages[i]);
- if (!nowait && ret == -EAGAIN) {
- ret = 0;
- goto again;
- }
- faili = i - 1;
- goto fail;
+ folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
+ if (IS_ERR(folio)) {
+ if (nowait)
+ ret = -EAGAIN;
+ else
+ ret = PTR_ERR(folio);
+ return ret;
+ }
+ /* Only support page sized folio yet. */
+ ASSERT(folio_order(folio) == 0);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+ }
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ if (ret) {
+ /* The folio is already unlocked. */
+ folio_put(folio);
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
+ goto again;
}
- wait_on_page_writeback(pages[i]);
+ return ret;
}
-
+ *folio_ret = folio;
return 0;
-fail:
- while (faili >= 0) {
- unlock_page(pages[faili]);
- put_page(pages[faili]);
- faili--;
- }
- return ret;
-
}
/*
- * This function locks the extent and properly waits for data=ordered extents
- * to finish before allowing the pages to be modified if need.
+ * Locks the extent and properly waits for data=ordered extents to finish
+ * before allowing the folio to be modified if needed.
*
- * The return value:
+ * Return:
* 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK
- * -EAGAIN - need re-prepare the pages
- * the other < 0 number - Something wrong happens
+ * -EAGAIN - need to prepare the folio again
*/
static noinline int
-lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes,
+lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
+ loff_t pos, size_t write_bytes,
u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos;
u64 last_pos;
- int i;
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
@@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (nowait) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
cached_state)) {
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- pages[i] = NULL;
- }
-
+ folio_unlock(folio);
+ folio_put(folio);
return -EAGAIN;
}
} else {
@@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset <= last_pos) {
unlock_extent(&inode->io_tree, start_pos, last_pos,
cached_state);
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
@@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * We should be called after prepare_pages() which should have locked
+ * We should be called after prepare_one_folio() which should have locked
 * the folio.
*/
- for (i = 0; i < num_pages; i++)
- WARN_ON(!PageLocked(pages[i]));
+ WARN_ON(!folio_test_locked(folio));
return ret;
}
@@ -1120,27 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now, ts;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- ts = inode_get_mtime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_mtime_to_ts(inode, now);
-
- ts = inode_get_ctime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_ctime_to_ts(inode, now);
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
+int btrfs_write_check(struct kiocb *iocb, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1170,7 +1097,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- update_time_for_write(inode);
+ if (!IS_NOCMTIME(inode)) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ inode_inc_iversion(inode);
+ }
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -1192,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
size_t num_written = 0;
- int nrptrs;
ssize_t ret;
- bool only_release_metadata = false;
- bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ bool only_release_metadata = false;
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1218,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret <= 0)
goto out;
- ret = btrfs_write_check(iocb, i, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0)
goto out;
pos = iocb->ki_pos;
- nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
- PAGE_SIZE / (sizeof(struct page *)));
- nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
- nrptrs = max(nrptrs, 8);
- pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out;
- }
-
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i),
- nrptrs * (size_t)PAGE_SIZE -
- offset);
- size_t num_pages;
+ size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
size_t reserve_bytes;
- size_t dirty_pages;
size_t copied;
size_t dirty_sectors;
size_t num_sectors;
+ struct folio *folio = NULL;
int extents_locked;
+ bool force_page_uptodate = false;
/*
- * Fault pages before locking them in prepare_pages
+ * Fault pages before locking them in prepare_one_folio()
* to avoid recursive lock
*/
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
@@ -1288,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
only_release_metadata = true;
}
- num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
- WARN_ON(num_pages > nrptrs);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
@@ -1317,23 +1230,17 @@ again:
break;
}
- /*
- * This is going to setup the pages array with the number of
- * pages we want, so we don't really need to worry about the
- * contents of pages from loop to loop
- */
- ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes, force_page_uptodate, false);
+ ret = prepare_one_folio(inode, &folio, pos, write_bytes,
+ force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
break;
}
- extents_locked = lock_and_cleanup_extent_if_need(
- BTRFS_I(inode), pages,
- num_pages, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
+ extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
+ folio, pos, write_bytes, &lockstart,
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
if (!nowait && extents_locked == -EAGAIN)
goto again;
@@ -1344,28 +1251,18 @@ again:
break;
}
- copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+ copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
fs_info->sectorsize);
dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
- /*
- * if we have trouble faulting in the pages, fall
- * back to one page at a time
- */
- if (copied < write_bytes)
- nrptrs = 1;
-
if (copied == 0) {
force_page_uptodate = true;
dirty_sectors = 0;
- dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = DIV_ROUND_UP(copied + offset,
- PAGE_SIZE);
}
if (num_sectors > dirty_sectors) {
@@ -1375,13 +1272,10 @@ again:
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- u64 __pos;
-
- __pos = round_down(pos,
- fs_info->sectorsize) +
- (dirty_pages << PAGE_SHIFT);
+ u64 release_start = round_up(pos + copied,
+ fs_info->sectorsize);
btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, __pos,
+ data_reserved, release_start,
release_bytes, true);
}
}
@@ -1389,15 +1283,14 @@ again:
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
- dirty_pages, pos, copied,
+ ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
&cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
* start offset is >= i_size, we might still have a non-NULL
* cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * as delalloc through btrfs_dirty_folio(). Therefore free any
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
@@ -1408,7 +1301,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
break;
}
@@ -1416,7 +1309,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
cond_resched();
@@ -1424,8 +1317,6 @@ again:
num_written += copied;
}
- kfree(pages);
-
if (release_bytes) {
if (only_release_metadata) {
btrfs_check_nocow_unlock(BTRFS_I(inode));
@@ -1470,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (ret || encoded->len == 0)
goto out;
- ret = btrfs_write_check(iocb, from, encoded->len);
+ ret = btrfs_write_check(iocb, encoded->len);
if (ret < 0)
goto out;
@@ -3802,6 +3693,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .uring_cmd = btrfs_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 912254e653cf..de89e644be29 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve);
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
@@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+int btrfs_write_check(struct kiocb *iocb, size_t count);
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4bcb2530660..cfa52ef40b06 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include <linux/string_choices.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
@@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
int bitmaps = 0;
int ret;
int must_iput = 0;
+ int i_size;
if (!i_size_read(inode))
return -EIO;
@@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
- io_ctl->num_pages, 0, i_size_read(inode),
- &cached_state, false);
- if (ret)
- goto out_nospc;
+ i_size = i_size_read(inode);
+ for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
+ u64 dirty_start = i * PAGE_SIZE;
+ u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start;
+
+ ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]),
+ dirty_start, dirty_len, &cached_state, false);
+ if (ret < 0)
+ goto out_nospc;
+ }
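
The min_t() clamp in the loop above only matters for the final, partially used page. Worked numbers, assuming PAGE_SIZE == 4096 and i_size == 9000 (hypothetical): the loop bound is round_up(9000, 4096) / 4096 == 3, so:

	/* i == 0: dirty_start = 0,    dirty_len = 4096              */
	/* i == 1: dirty_start = 4096, dirty_len = 4096              */
	/* i == 2: dirty_start = 8192, dirty_len = 9000 - 8192 = 808 */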
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (info->bytes >= bytes && !block_group->ro)
count++;
btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
- info->offset, info->bytes,
- (info->bitmap) ? "yes" : "no");
+ info->offset, info->bytes, str_yes_no(info->bitmap));
}
spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
- list_empty(&block_group->cluster_list) ? "no" : "yes");
+ str_no_yes(list_empty(&block_group->cluster_list)));
btrfs_info(fs_info,
"%d free space entries at or bigger than %llu bytes",
count, bytes);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79f64e383edd..79a1a3d6f04d 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -263,10 +263,10 @@ enum {
BTRFS_FEATURE_INCOMPAT_ZONED | \
BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
 * Features under development, like extent tree v2 support, are enabled
- * only under CONFIG_BTRFS_DEBUG.
+ * only under CONFIG_BTRFS_EXPERIMENTAL.
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
@@ -317,6 +317,8 @@ struct btrfs_dev_replace {
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
+
+ struct task_struct *replace_task;
};
/*
@@ -633,9 +635,10 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
- spinlock_t extent_map_shrinker_lock;
- u64 extent_map_shrinker_last_root;
- u64 extent_map_shrinker_last_ino;
+ u64 em_shrinker_last_root;
+ u64 em_shrinker_last_ino;
+ atomic64_t em_shrinker_nr_to_scan;
+ struct work_struct em_shrinker_work;
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -876,12 +879,9 @@ struct btrfs_fs_info {
#endif
};
-#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
- struct page *: (_page))->mapping->host))
#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
struct folio *: (_folio))->mapping->host))
-#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
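
The surviving folio_to_inode()/folio_to_fs_info() macros use C11 _Generic purely as a compile-time type check: the selection only matches a struct folio *, so now that the page_to_*() variants are gone, passing a struct page * fails to build instead of being silently accepted. A minimal sketch of the pattern:

	#define folio_host(_f) (_Generic((_f), struct folio *: (_f))->mapping->host)

	struct folio *folio;
	struct page *page;

	folio_host(folio);	/* OK */
	folio_host(page);	/* build error: no matching _Generic association */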
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e4ca1e7d2e5..03fe0de2cd0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
index++;
continue;
}
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++;
if (IS_ERR(folio))
continue;
@@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
struct folio *folio;
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
- 0, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_local_folio(folio, 0);
@@ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
* If being used directly, you must have already checked we're allowed to cow
* the range by getting true from can_cow_file_range_inline().
*/
-static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size,
int compress_type,
struct folio *compressed_folio,
@@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return 1;
lock_extent(&inode->io_tree, offset, end, &cached);
- ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
@@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0;
}
/*
- * Special check for subpage.
+ * Only enable sector perfect compression for experimental builds.
*
- * We lock the full page then run each delalloc range in the page, thus
- * for the following case, we will hit some subpage specific corner case:
+ * This is a big feature change for subpage cases, and can hit
+ * different corner cases, so limit this feature to
+ * experimental builds for now.
*
- * 0 32K 64K
- * | |///////| |///////|
- * \- A \- B
- *
- * In above case, both range A and range B will try to unlock the full
- * page [0, 64K), causing the one finished later will have page
- * unlocked already, triggering various page lock requirement BUG_ON()s.
- *
- * So here we add an artificial limit that subpage compression can only
- * if the range is fully page aligned.
- *
- * In theory we only need to ensure the first page is fully covered, but
- * the tailing partial page will be locked until the full compression
- * finishes, delaying the write of other range.
- *
- * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
- * first to prevent any submitted async extent to unlock the full page.
- * By this, we can ensure for subpage case that only the last async_cow
- * will unlock the full page.
+ * ETA for moving this out of experimental builds is 6.15.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
if (!PAGE_ALIGNED(start) ||
!PAGE_ALIGNED(end + 1))
return 0;
@@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- folio_clear_dirty_for_io(folio);
+ btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ end + 1 - start);
folio_put(folio);
}
return ret;
@@ -1001,17 +985,6 @@ again:
(start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
- /*
- * For subpage case, we require full page alignment for the sector
- * aligned range.
- * Thus we must also check against @actual_end, not just @end.
- */
- if (blocksize < PAGE_SIZE) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(round_up(actual_end, blocksize)))
- goto cleanup_and_bail_uncompressed;
- }
-
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
- unsigned long ram_size;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
@@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
- bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
@@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
- cur_alloc_size = num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret == -EAGAIN) {
@@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
- extent_reserved = true;
- ram_size = ins.offset;
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.num_bytes = ins.offset;
@@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
- lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*/
if (ret)
btrfs_drop_extent_map_range(inode, start,
- start + ram_size - 1,
+ start + cur_alloc_size - 1,
false);
}
btrfs_put_ordered_extent(ordered);
@@ -1513,13 +1481,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits
*
- * Do set the Ordered (Private2) bit so we know this page was
+ * Do set the Ordered flag so we know this page was
* properly setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
- extent_reserved = false;
+ cur_alloc_size = 0;
/*
* btrfs_reloc_clone_csums() error, since start is increased
@@ -1545,7 +1513,7 @@ done:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
+ btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1599,13 +1567,12 @@ out_unlock:
* to decrement again the data space_info's bytes_may_use counter,
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
- if (extent_reserved) {
+ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- start += cur_alloc_size;
}
/*
@@ -1614,11 +1581,13 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- if (start < end) {
+ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_folio,
+ extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
+ end, locked_folio,
&cached, clear_bits, page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
+ end - start - cur_alloc_size + 1, NULL);
}
return ret;
}
@@ -1729,7 +1698,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, &locked_folio->page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
@@ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- if (freespace_inode)
- trans = btrfs_join_transaction_spacecache(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, ret);
-
- ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
- btrfs_abort_transaction(trans, ret);
-
- goto out;
- }
-
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ /* Logic error */
+ ASSERT(list_empty(&ordered_extent->list));
+ if (!list_empty(&ordered_extent->list)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ ret = btrfs_update_inode_fallback(trans, inode);
+ if (ret) {
+ /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ }
goto out;
+ }
+
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
return 0;
}
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
+
+ if (inode_unhashed(&inode->vfs_inode))
+ return 0;
+
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
+}
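
btrfs_add_inode_to_root() uses the xarray reserve-then-store idiom: xa_reserve() preallocates memory in a context that may sleep (GFP_NOFS), so the later xa_store() under GFP_ATOMIC consumes the reservation and cannot fail with -ENOMEM. A distilled sketch of the idiom with hypothetical names:

	static DEFINE_XARRAY(table);

	int table_insert(unsigned long index, void *entry)
	{
		void *old;
		int ret;

		ret = xa_reserve(&table, index, GFP_NOFS);	/* may sleep */
		if (ret)
			return ret;

		old = xa_store(&table, index, entry, GFP_ATOMIC);
		return xa_err(old);	/* 0 unless an error entry came back */
	}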
+
/*
- * read an inode from the btree into the in-memory inode
+ * Read a locked inode from the btree into the in-memory inode and add it to
+ * its root list/tree.
+ *
+ * On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode,
- struct btrfs_path *in_path)
+static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode,
ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
if (ret)
- return ret;
+ goto out;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
- if (!path) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- }
+ ASSERT(path);
btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- if (path != in_path)
- btrfs_free_path(path);
- return ret;
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_lookup_inode(); this means the inode was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
}
leaf = path->nodes[0];
@@ -3965,8 +3960,6 @@ cache_acl:
btrfs_ino(BTRFS_I(inode)),
btrfs_root_id(root), ret);
}
- if (path != in_path)
- btrfs_free_path(path);
if (!maybe_acls)
cache_no_acl(inode);
@@ -3993,7 +3986,15 @@ cache_acl:
}
btrfs_sync_inode_flags_to_i_flags(inode);
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret)
+ goto out;
+
return 0;
+out:
+ iget_failed(inode);
+ return ret;
}
/*
@@ -5502,35 +5503,7 @@ out:
return err;
}
-static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_inode *existing;
- const u64 ino = btrfs_ino(inode);
- int ret;
-
- if (inode_unhashed(&inode->vfs_inode))
- return 0;
- if (prealloc) {
- ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
- if (ret)
- return ret;
- }
-
- existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
-
- if (xa_is_err(existing)) {
- ret = xa_err(existing);
- ASSERT(ret != -EINVAL);
- ASSERT(ret != -ENOMEM);
- return ret;
- } else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
- }
-
- return 0;
-}
static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
@@ -5592,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
}
/*
- * Get an inode object given its inode number and corresponding root.
- * Path can be preallocated to prevent recursing back to iget through
- * allocator. NULL is also valid but may require an additional allocation
- * later.
+ * Get an inode object given its inode number and corresponding root. Path is
+ * preallocated to prevent recursing back to iget through the allocator.
*/
struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path)
@@ -5611,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
return inode;
ret = btrfs_read_locked_inode(inode, path);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode(), this means the inode item was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- if (ret < 0)
- goto error;
-
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
- if (ret < 0)
- goto error;
+ if (ret)
+ return ERR_PTR(ret);
unlock_new_inode(inode);
-
return inode;
-error:
- iget_failed(inode);
- return ERR_PTR(ret);
}
+/*
+ * Get an inode object given its inode number and corresponding root.
+ */
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(ino, root, NULL);
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+
+ inode = btrfs_iget_locked(ino, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_read_locked_inode(inode, path);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ unlock_new_inode(inode);
+ return inode;
}
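
The rewritten btrfs_iget() follows the standard iget_locked() protocol: a cache hit returns an initialized inode immediately; a cache miss returns an I_NEW inode that must be completed with unlock_new_inode() or torn down with iget_failed() (done here inside btrfs_read_locked_inode() on error). A generic sketch of the protocol with hypothetical names:

	struct inode *my_iget(struct super_block *sb, unsigned long ino)
	{
		struct inode *inode = iget_locked(sb, ino);

		if (!inode)
			return ERR_PTR(-ENOMEM);
		if (!(inode->i_state & I_NEW))
			return inode;		/* cache hit */

		if (my_fill_inode(inode)) {	/* hypothetical reader */
			iget_failed(inode);	/* marks it bad, releases it */
			return ERR_PTR(-EIO);
		}
		unlock_new_inode(inode);	/* wake concurrent lookups */
		return inode;
	}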
static struct inode *new_simple_dir(struct inode *dir,
@@ -6023,7 +6004,7 @@ again:
* offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as
- * they're returned by readdir. Until we re-use freed offsets
+ * they're returned by readdir. Until we reuse freed offsets
* we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge
* offset.
@@ -6765,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
return ret;
}
-static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct folio *folio)
+static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
@@ -6964,7 +6944,7 @@ next:
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, folio);
+ ret = read_inline_extent(path, folio);
if (ret < 0)
goto out;
goto insert;
@@ -7294,7 +7274,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*
* But already submitted bio can still be finished on this folio.
* Furthermore, endio function won't skip folio which has Ordered
- * (Private2) already cleared, so it's possible for endio and
+ * already cleared, so it's possible for endio and
* invalidate_folio to do the same ordered extent accounting twice
* on one folio.
*
@@ -7360,7 +7340,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
range_len = range_end + 1 - cur;
if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
- * If Ordered (Private2) is cleared, it means endio has
+ * If Ordered is cleared, it means endio has
* already been executed for the range.
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
@@ -7433,7 +7413,7 @@ next:
}
/*
* We have iterated through all ordered extents of the page, the page
- * should not have Ordered (Private2) anymore, or the above iteration
+ * should not have Ordered anymore, or the above iteration
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
@@ -8972,28 +8952,6 @@ out_inode:
return finish_open_simple(file, ret);
}
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct folio *folio;
- u32 len;
-
- ASSERT(end + 1 - start <= U32_MAX);
- len = end + 1 - start;
- while (index <= end_index) {
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
- ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
-
- /* This is for data, which doesn't yet support larger folio. */
- ASSERT(folio_order(folio) == 0);
- btrfs_folio_set_writeback(fs_info, folio, start, len);
- folio_put(folio);
- index++;
- }
-}
-
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type)
{
@@ -9038,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline(
unsigned long ptr;
void *tmp;
ssize_t ret;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
+
+ path->nowait = nowait;
+
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
@@ -9107,6 +9069,7 @@ out:
struct btrfs_encoded_read_private {
wait_queue_head_t wait;
+ void *uring_ctx;
atomic_t pending;
blk_status_t status;
};
@@ -9126,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
- wake_up(&priv->wait);
+ if (atomic_dec_return(&priv->pending) == 0) {
+ int err = blk_status_to_errno(READ_ONCE(priv->status));
+
+ if (priv->uring_ctx) {
+ btrfs_uring_read_extent_endio(priv->uring_ctx, err);
+ kfree(priv);
+ } else {
+ wake_up(&priv->wait);
+ }
+ }
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv;
unsigned long i = 0;
struct btrfs_bio *bbio;
+ int ret;
+
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
- init_waitqueue_head(&priv.wait);
+ init_waitqueue_head(&priv->wait);
+ atomic_set(&priv->pending, 1);
+ priv->status = 0;
+ priv->uring_ctx = uring_ctx;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -9153,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -9168,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
- /* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ if (uring_ctx) {
+ if (atomic_dec_return(&priv->pending) == 0) {
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ btrfs_uring_read_extent_endio(uring_ctx, ret);
+ kfree(priv);
+ return ret;
+ }
+
+ return -EIOCBQUEUED;
+ } else {
+ if (atomic_dec_return(&priv->pending) != 0)
+ io_wait_event(priv->wait, !atomic_read(&priv->pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ kfree(priv);
+ return ret;
+ }
}
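
The pending counter in btrfs_encoded_read_regular_fill_pages() is a bias count: it starts at 1 so in-flight bios can never drop it to zero before submission finishes, and the submitter's final decrement picks between the synchronous wait and the io_uring completion. A stripped-down user-space sketch of the same pattern; submit_unit() and complete_request() are hypothetical stand-ins, not btrfs functions:

	#include <stdatomic.h>

	struct req {
		atomic_int pending;	/* biased to 1 by the submitter */
	};

	void complete_request(struct req *r);		/* hypothetical */
	void submit_unit(struct req *r, int idx);	/* hypothetical; calls unit_done() */

	static void unit_done(struct req *r)
	{
		/* Last decrement (previous value 1) completes the request. */
		if (atomic_fetch_sub(&r->pending, 1) == 1)
			complete_request(r);
	}

	static void submit_all(struct req *r, int nr_units)
	{
		atomic_init(&r->pending, 1);	/* the bias */
		for (int i = 0; i < nr_units; i++) {
			atomic_fetch_add(&r->pending, 1);
			submit_unit(r, i);
		}
		unit_done(r);	/* drop the bias; completes here if all units are done */
	}
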
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -9203,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out;
}
- ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
- disk_io_size, pages);
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, NULL);
if (ret)
goto out;
@@ -9244,21 +9232,26 @@ out:
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
bool unlocked = false;
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = btrfs_inode_lock(inode,
+ BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
+ if (ret)
+ return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9271,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- for (;;) {
+ if (nowait) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(inode, start,
- lockend - start + 1);
- if (ret)
+ if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
+ start, lockend)) {
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+
+ if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ ret = -EAGAIN;
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
- if (!ordered)
- break;
- btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
- cond_resched();
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+ } else {
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+
+ lock_extent(io_tree, start, lockend, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ cond_resched();
+ }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
@@ -9304,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
- goto out;
+ goto out_unlock_extent;
}
/*
@@ -9317,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->disk_bytenr;
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
@@ -9331,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->disk_num_bytes;
+ *disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
@@ -9341,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_unlock_extent;
}
-out:
- if (ret >= 0)
- iocb->ki_pos += encoded->len;
out_em:
free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read. */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
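
After this change, -EIOCBQUEUED from btrfs_encoded_read() is not an error: it signals that the extent and inode locks are still held and that *disk_bytenr/*disk_io_size describe the read still to be done. A condensed sketch of the caller contract, mirroring the ioctl path further below (hypothetical wrapper, declarations trimmed to the essentials):

	static ssize_t encoded_read_and_finish(struct kiocb *kiocb, struct iov_iter *iter,
					       struct btrfs_ioctl_encoded_io_args *args,
					       struct btrfs_inode *inode,
					       u64 start, u64 lockend, u64 count)
	{
		struct extent_state *cached_state = NULL;
		u64 disk_bytenr, disk_io_size;
		bool unlocked = false;
		ssize_t ret;

		ret = btrfs_encoded_read(kiocb, iter, args, &cached_state,
					 &disk_bytenr, &disk_io_size);
		if (ret != -EIOCBQUEUED)
			return ret;	/* inline, hole or error; locks already dropped */

		/* Extent and inode locks are still held here. */
		ret = btrfs_encoded_read_regular(kiocb, iter, start, lockend,
						 &cached_state, disk_bytenr,
						 disk_io_size, count,
						 args->compression, &unlocked);
		if (!unlocked) {
			unlock_extent(&inode->io_tree, start, lockend, &cached_state);
			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		}
		return ret;
	}
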
@@ -9492,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
if (!folios)
return -ENOMEM;
for (i = 0; i < nr_folios; i++) {
@@ -9556,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
- ret = __cow_file_range_inline(inode, start, encoded->len,
+ ret = __cow_file_range_inline(inode, encoded->len,
orig_count, compression, folios[0],
true);
if (ret <= 0) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 226c91fe31a7..c9302d193187 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
struct btrfs_qgroup_inherit *inherit)
{
int ret;
- bool snapshot_force_cow = false;
/*
* Force new buffered writes to reserve space even when NOCOW is
@@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
* creation.
*/
atomic_inc(&root->snapshot_force_cow);
- snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
+ atomic_dec(&root->snapshot_force_cow);
out:
- if (snapshot_force_cow)
- atomic_dec(&root->snapshot_force_cow);
btrfs_drew_read_unlock(&root->snapshot_lock);
return ret;
}
@@ -1308,9 +1306,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
ret = btrfs_mksubvol(&file->f_path, idmap, name,
namelen, NULL, readonly, inherit);
} else {
- struct fd src = fdget(fd);
+ CLASS(fd, src)(fd);
struct inode *src_inode;
- if (!fd_file(src)) {
+ if (fd_empty(src)) {
ret = -EINVAL;
goto out_drop_write;
}
@@ -1341,7 +1339,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
- fdput(src);
}
out_drop_write:
mnt_drop_write_file(file);
@@ -4058,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
return 0;
}
-static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
- void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4514,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags);
size_t copy_end;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
ssize_t ret;
+ u64 disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4572,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+
+ if (ret == -EIOCBQUEUED) {
+ bool unlocked = false;
+ u64 start, lockend, count;
+
+ start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ if (args.compression)
+ count = disk_io_size;
+ else
+ count = args.len;
+
+ ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, &unlocked);
+
+ if (!unlocked) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ }
+
if (ret >= 0) {
fsnotify_access(file);
if (copy_to_user(argp + copy_end,
@@ -4690,6 +4716,439 @@ out_acct:
return ret;
}
+/*
+ * Context that's attached to an encoded read io_uring command, in cmd->pdu. It
+ * contains the fields from btrfs_uring_read_extent() that are necessary to
+ * finish off and clean up the I/O in btrfs_uring_read_finished().
+ */
+struct btrfs_uring_priv {
+ struct io_uring_cmd *cmd;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct kiocb iocb;
+ struct iovec *iov;
+ struct iov_iter iter;
+ struct extent_state *cached_state;
+ u64 count;
+ u64 start;
+ u64 lockend;
+ int err;
+ bool compressed;
+};
+
+struct io_btrfs_cmd {
+ struct btrfs_uring_priv *priv;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_priv *priv = bc->priv;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ unsigned long index;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ if (priv->err) {
+ ret = priv->err;
+ goto out;
+ }
+
+ if (priv->compressed) {
+ index = 0;
+ page_offset = 0;
+ } else {
+ index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+ page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
+ }
+ cur = 0;
+ while (cur < priv->count) {
+ size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
+ &priv->iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ index++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = priv->count;
+
+out:
+ unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ add_rchar(current, ret);
+
+ for (index = 0; index < priv->nr_pages; index++)
+ __free_page(priv->pages[index]);
+
+ kfree(priv->pages);
+ kfree(priv->iov);
+ kfree(priv);
+}
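
For uncompressed reads the copy loop above starts mid-buffer, because the pages were filled from the sector-aligned start while the user's data begins at ki_pos. A worked example of the index arithmetic, assuming 4K pages and ki_pos - start = 5000:

	index       = 5000 >> PAGE_SHIFT;    /* = 1: skip the first page       */
	page_offset = offset_in_page(5000);  /* = 5000 - 4096 = 904            */
	/* First copy moves min(count, 4096 - 904) bytes, then page_offset
	 * resets to 0 and whole pages are copied until count is consumed. */
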
+
+void btrfs_uring_read_extent_endio(void *ctx, int err)
+{
+ struct btrfs_uring_priv *priv = ctx;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd);
+
+ priv->err = err;
+ bc->priv = priv;
+
+ io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state *cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ struct iovec *iov, struct io_uring_cmd *cmd)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ struct btrfs_uring_priv *priv = NULL;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv = kmalloc(sizeof(*priv), GFP_NOFS);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv->iocb = *iocb;
+ priv->iov = iov;
+ priv->iter = *iter;
+ priv->count = count;
+ priv->cmd = cmd;
+ priv->cached_state = cached_state;
+ priv->compressed = compressed;
+ priv->nr_pages = nr_pages;
+ priv->pages = pages;
+ priv->start = start;
+ priv->lockend = lockend;
+ priv->err = 0;
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, priv);
+ if (ret && ret != -EIOCBQUEUED)
+ goto out_fail;
+
+ /*
+ * If we return -EIOCBQUEUED, we're deferring the cleanup to
+ * btrfs_uring_read_finished(), which will handle unlocking the extent
+ * and inode and freeing the allocations.
+ */
+
+ return -EIOCBQUEUED;
+
+out_fail:
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ kfree(priv);
+ return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
+ size_t copy_end;
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ int ret;
+ u64 disk_bytenr, disk_io_size;
+ struct file *file;
+ struct btrfs_inode *inode;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree *io_tree;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ struct extent_state *cached_state = NULL;
+ u64 start, lockend;
+ void __user *sqe_addr;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+ file = cmd->file;
+ inode = BTRFS_I(file->f_inode);
+ fs_info = inode->root->fs_info;
+ io_tree = &inode->io_tree;
+ sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
+ if (copy_from_user(&args32, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
+
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_free;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ kiocb.ki_flags |= IOCB_NOWAIT;
+
+ start = ALIGN_DOWN(pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ goto out_free;
+
+ file_accessed(file);
+
+ if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel)) {
+ if (ret == -EIOCBQUEUED) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ if (ret == -EIOCBQUEUED) {
+ u64 count;
+
+ /*
+		 * import_iovec() may have used the on-stack iovec array. The
+		 * read now outlives this function, so move the iovecs to the
+		 * heap before handing them off.
+ */
+ if (!iov) {
+ iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
+ if (!iov) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
+ }
+
+ count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+ /* Match ioctl by not returning past EOF if uncompressed. */
+ if (!args.compression)
+ count = min_t(u64, count, args.len);
+
+ ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+ cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, iov, cmd);
+
+ goto out_acct;
+ }
+
+out_free:
+ kfree(iov);
+
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+
+ return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+#endif
+ return btrfs_uring_encoded_read(cmd, issue_flags);
+ }
+
+ return -EINVAL;
+}
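
btrfs_uring_cmd() only dispatches on cmd->cmd_op; the ->uring_cmd wiring in file_operations is not shown in this diff. The following user-space sketch is therefore an assumption about how the command is issued: cmd_op selects BTRFS_IOC_ENCODED_READ and sqe->addr carries the same struct btrfs_ioctl_encoded_io_args pointer the ioctl uses (raw SQE fields, liburing for the ring plumbing):

	#include <liburing.h>
	#include <linux/btrfs.h>
	#include <string.h>

	/* Hypothetical user-space sketch; not part of the kernel patch. */
	static int queue_encoded_read(struct io_uring *ring, int fd,
				      struct btrfs_ioctl_encoded_io_args *args)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		if (!sqe)
			return -1;
		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_URING_CMD;
		sqe->fd = fd;
		sqe->cmd_op = BTRFS_IOC_ENCODED_READ;	/* matched in btrfs_uring_cmd() */
		sqe->addr = (unsigned long)args;	/* read via cmd->sqe->addr above */
		return io_uring_submit(ring);
	}
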
+
+static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
+{
+ struct btrfs_root *root;
+ struct btrfs_ioctl_subvol_wait args = { 0 };
+ signed long sched_ret;
+ int refs;
+ u64 root_flags;
+ bool wait_for_deletion = false;
+ bool found = false;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ switch (args.mode) {
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
+ /*
+		 * Wait for the first one that was deleted; this implies
+		 * waiting until all the ones queued before it are cleaned.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ found = true;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (!found)
+ return -ENOENT;
+
+ fallthrough;
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
+ if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
+ BTRFS_LAST_FREE_OBJECTID < args.subvolid)
+ return -EINVAL;
+ break;
+ case BTRFS_SUBVOL_SYNC_COUNT:
+ spin_lock(&fs_info->trans_lock);
+ args.count = list_count_nodes(&fs_info->dead_roots);
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
+ spin_lock(&fs_info->trans_lock);
+ /* Last in the list was deleted first. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_LAST:
+ spin_lock(&fs_info->trans_lock);
+ /* First in the list was deleted last. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ /* 32bit limitation: fs_roots_radix key is not wide enough. */
+ if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
+ return -EOVERFLOW;
+
+ while (1) {
+ /* Wait for the specific one. */
+ if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
+ return -EINTR;
+ refs = -1;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)args.subvolid);
+ if (root) {
+ spin_lock(&root->root_item_lock);
+ refs = btrfs_root_refs(&root->root_item);
+ root_flags = btrfs_root_flags(&root->root_item);
+ spin_unlock(&root->root_item_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ up_read(&fs_info->subvol_sem);
+
+ /* Subvolume does not exist. */
+ if (!root)
+ return -ENOENT;
+
+ /* Subvolume not deleted at all. */
+ if (refs > 0)
+ return -EEXIST;
+ /* We've waited and now the subvolume is gone. */
+ if (wait_for_deletion && refs == -1) {
+ /* Return the one we waited for as the last one. */
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ }
+
+ /* Subvolume not found on the first try (deleted or never existed). */
+ if (refs == -1)
+ return -ENOENT;
+
+ wait_for_deletion = true;
+ ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+ sched_ret = schedule_timeout_interruptible(HZ);
+ /* Early wake up or error. */
+ if (sched_ret != 0)
+ return -EINTR;
+ }
+
+ return 0;
+}
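
Driven from user space, the new ioctl takes a struct btrfs_ioctl_subvol_wait whose mode selects one of the cases above. A hedged usage sketch of the count-then-wait pattern, assuming the uapi definitions land in linux/btrfs.h alongside the other ioctls:

	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	/* Hypothetical user-space sketch; error handling trimmed. */
	static int wait_for_queued_deletions(int fd)
	{
		struct btrfs_ioctl_subvol_wait args = { 0 };

		args.mode = BTRFS_SUBVOL_SYNC_COUNT;
		if (ioctl(fd, BTRFS_IOC_SUBVOL_SYNC_WAIT, &args) < 0)
			return -1;
		if (args.count == 0)
			return 0;	/* no deletions queued */

		/* Kernel picks a queued deletion and blocks until it is cleaned. */
		args.mode = BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED;
		return ioctl(fd, BTRFS_IOC_SUBVOL_SYNC_WAIT, &args);
	}
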
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4812,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
@@ -4841,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_ENCODED_WRITE_32:
return btrfs_ioctl_encoded_write(file, argp, true);
#endif
+ case BTRFS_IOC_SUBVOL_SYNC_WAIT:
+ return btrfs_ioctl_subvol_sync(fs_info, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..2b760c8778f8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void btrfs_uring_read_extent_endio(void *ctx, int err);
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6a0b7abb5bd9..9a7a7b723305 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
}
/*
- * Try-lock for write.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_try_tree_write_lock(struct extent_buffer *eb)
-{
- if (down_write_trylock(&eb->lock)) {
- btrfs_set_eb_lock_owner(eb, current->pid);
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
- }
- return 0;
-}
-
-/*
* Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 3c15c75e0582..46c8be2afab1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
-int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 72856f6775f7..a45bc11f8665 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2104d60c2161..30eceaf829a7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -111,8 +111,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
-static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
- u64 len)
+static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
{
if (file_offset + len <= entry->file_offset ||
entry->file_offset + entry->num_bytes <= file_offset)
@@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
- * Ordered (Private2) bit indicates whether we still have
+ * Ordered flag indicates whether we still have
* pending io unfinished for the ordered extent.
*
- * If there's no such bit, we need to skip to next range.
+	 * If it's not set, we need to skip to the next range.
*/
if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
@@ -985,7 +985,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
while (1) {
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (range_overlaps(entry, file_offset, len))
+ if (btrfs_range_overlaps(entry, file_offset, len))
break;
if (entry->file_offset >= file_offset + len) {
@@ -1114,12 +1114,12 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
}
if (prev) {
entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
- if (range_overlaps(entry, file_offset, len))
+ if (btrfs_range_overlaps(entry, file_offset, len))
goto out;
}
if (next) {
entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
- if (range_overlaps(entry, file_offset, len))
+ if (btrfs_range_overlaps(entry, file_offset, len))
goto out;
}
/* No ordered extent in the range */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a0e8deca87a7..a6f92836c9b1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
@@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
return 0;
}
@@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
/*
* If a qgroup exists for a subvolume ID, it is possible
* that subvolume has been deleted, in which case
- * re-using that ID would lead to incorrect accounting.
+ * reusing that ID would lead to incorrect accounting.
*
* Ensure that we skip any such subvol ids.
*
@@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
}
@@ -2001,27 +2000,27 @@ out:
* Return <0 for insertion failure, caller can free @record safely.
*/
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr)
{
struct btrfs_qgroup_extent_record *existing, *ret;
- const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits);
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
if (!btrfs_qgroup_full_accounting(fs_info))
return 1;
#if BITS_PER_LONG == 32
- if (record->bytenr >= MAX_LFS_FILESIZE) {
+ if (bytenr >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
- record->bytenr);
+ bytenr);
btrfs_err_32bit_limit(fs_info);
return -EOVERFLOW;
}
#endif
- lockdep_assert_held(&delayed_refs->lock);
- trace_btrfs_qgroup_trace_extent(fs_info, record);
+ trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);
xa_lock(&delayed_refs->dirty_extents);
existing = xa_load(&delayed_refs->dirty_extents, index);
@@ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord)
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr)
{
- struct btrfs_backref_walk_ctx ctx = { 0 };
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_backref_walk_ctx ctx = {
+ .bytenr = bytenr,
+ .fs_info = fs_info,
+ };
int ret;
- if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
* We are always called in a context where we are already holding a
@@ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
- if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0;
- ctx.bytenr = qrecord->bytenr;
- ctx.fs_info = trans->fs_info;
-
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(trans->fs_info);
- btrfs_warn(trans->fs_info,
+ qgroup_mark_inconsistent(fs_info);
+ btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
int ret;
@@ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record)
return -ENOMEM;
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
kfree(record);
return -ENOMEM;
}
- delayed_refs = &trans->transaction->delayed_refs;
- record->bytenr = bytenr;
record->num_bytes = num_bytes;
- record->old_roots = NULL;
- spin_lock(&delayed_refs->lock);
- ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
- spin_unlock(&delayed_refs->lock);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index);
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
}
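
With bytenr removed from struct btrfs_qgroup_extent_record, the dirty_extents xarray index is its only encoding, and the conversion is a shift in each direction; this holds because extent bytenrs are sectorsize aligned. A sketch of the round-trip used above and in btrfs_qgroup_account_extents():

	static unsigned long qrecord_index(const struct btrfs_fs_info *fs_info,
					   u64 bytenr)
	{
		return bytenr >> fs_info->sectorsize_bits;
	}

	static u64 qrecord_bytenr(const struct btrfs_fs_info *fs_info,
				  unsigned long index)
	{
		/* Inverse of qrecord_index() for sectorsize-aligned bytenrs. */
		return ((u64)index) << fs_info->sectorsize_bits;
	}
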
/*
@@ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!extent_buffer_uptodate(root_eb)) {
struct btrfs_tree_parent_check check = {
- .has_first_key = false,
.transid = root_gen,
.level = root_level
};
@@ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
xa_for_each(&delayed_refs->dirty_extents, index, record) {
+ const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);
+
num_dirty_extents++;
- trace_btrfs_qgroup_account_extents(fs_info, record);
+ trace_btrfs_qgroup_account_extents(fs_info, record, bytenr);
if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
struct btrfs_backref_walk_ctx ctx = { 0 };
- ctx.bytenr = record->bytenr;
+ ctx.bytenr = bytenr;
ctx.fs_info = fs_info;
/*
@@ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
ulist_del(record->old_roots, qgroup_to_skip,
0);
}
- ret = btrfs_qgroup_account_extent(trans, record->bytenr,
+ ret = btrfs_qgroup_account_extent(trans, bytenr,
record->num_bytes,
record->old_roots,
new_roots);
@@ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
- btrfs_run_delayed_iputs(root->fs_info);
- btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
+ /*
+ * After waiting for ordered extents run delayed iputs in order to free
+ * space from unlinked files before committing the current transaction,
+ * as ordered extents may have been holding the last reference of an
+ * inode and they add a delayed iput when they complete.
+ */
+ btrfs_run_delayed_iputs(root->fs_info);
+ btrfs_wait_on_delayed_iputs(root->fs_info);
+
ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
@@ -4687,8 +4691,7 @@ out:
* BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree
*/
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
xa_destroy(&trans->delayed_refs.dirty_extents);
}
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
-{
- if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
- return;
-
- if (!is_fstree(root))
- return;
-
- btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
-}
-
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta)
{
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c229256d6fd5..e233cc79af18 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -127,7 +127,12 @@ struct btrfs_inode;
* Record a dirty extent, and info qgroup to update quota on it
*/
struct btrfs_qgroup_extent_record {
- u64 bytenr;
+ /*
+ * The bytenr of the extent is given by its index in the dirty_extents
+ * xarray of struct btrfs_delayed_ref_root left shifted by
+ * fs_info->sectorsize_bits.
+ */
+
u64 num_bytes;
/*
@@ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr);
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord);
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr);
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -432,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -442,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 4c859b550f6c..9ffc79f250fb 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,6 +13,39 @@
#include "volumes.h"
#include "print-tree.h"
+static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ const struct btrfs_key *oldkey,
+ u64 newlen, u64 frontpad)
+{
+ struct btrfs_stripe_extent *extent;
+ struct extent_buffer *leaf;
+ int slot;
+ size_t item_size;
+ struct btrfs_key newkey = {
+ .objectid = oldkey->objectid + frontpad,
+ .type = BTRFS_RAID_STRIPE_KEY,
+ .offset = newlen,
+ };
+
+ ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(leaf, slot);
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
+ struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 phys;
+
+ phys = btrfs_raid_stride_physical(leaf, stride);
+ btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+ }
+
+ btrfs_set_item_key_safe(trans, path, &newkey);
+}
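
The helper's effect is fully described by the (newlen, frontpad) pair: the key objectid moves forward by frontpad, the key offset becomes newlen, and every stride's physical address advances by frontpad. A worked example, assuming a 64K stripe extent keyed at 128K from which the back 16K is kept:

	/* oldkey  = { .objectid = 128K, BTRFS_RAID_STRIPE_KEY, .offset = 64K }     */
	/* call    : btrfs_partially_delete_raid_extent(..., newlen=16K, frontpad=48K) */
	/* newkey  = { .objectid = 176K, BTRFS_RAID_STRIPE_KEY, .offset = 16K }     */
	/* strides : each btrfs_raid_stride physical address advances by 48K        */
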
+
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
while (1) {
key.objectid = start;
key.type = BTRFS_RAID_STRIPE_KEY;
- key.offset = length;
+ key.offset = 0;
ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
if (ret < 0)
break;
- if (ret > 0) {
- ret = 0;
- if (path->slots[0] == 0)
- break;
+
+ if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
path->slots[0]--;
- }
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
found_start = key.objectid;
found_end = found_start + key.offset;
+ ret = 0;
+
+ if (key.type != BTRFS_RAID_STRIPE_KEY)
+ break;
/* That stripe ends before we start, we're done. */
if (found_end <= start)
@@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
trace_btrfs_raid_extent_delete(fs_info, start, end,
found_start, found_end);
- ASSERT(found_start >= start && found_end <= end);
+ /*
+ * The stripe extent starts before the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- keep ---|--- drop ---|
+ *
+	 * This means we have to truncate the item in place, shrinking
+	 * the key offset (the length) to the kept size.
+ */
+ if (found_start < start) {
+ u64 diff = start - found_start;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, 0);
+ break;
+ }
+
+ /*
+ * The stripe extent ends after the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- drop ---|--- keep ---|
+ *
+	 * This means we have to truncate the item in place, moving the
+	 * key objectid and the stride physical addresses forward and
+	 * shrinking the key offset to the kept size.
+ */
+ if (found_end > end) {
+ u64 diff = found_end - end;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, diff);
+ break;
+ }
+
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
break;
@@ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
- struct btrfs_io_context *bioc)
+EXPORT_FOR_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key stripe_key;
@@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
found_end = found_logical + found_length;
if (found_logical > end) {
- ret = -ENOENT;
+ ret = -ENODATA;
goto out;
}
@@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
}
/* If we're here, we haven't found the requested devid in the stripe. */
- ret = -ENOENT;
+ ret = -ENODATA;
out:
if (ret > 0)
- ret = -ENOENT;
+ ret = -ENODATA;
if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 1ac1c21aac2f..541836421778 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered_extent);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc);
+#endif
+
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
u64 map_type)
{
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 39bec672df0c..cdd373c27784 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list)
static void assert_rbio(struct btrfs_raid_bio *rbio)
{
- if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
- !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f3834f8d26b4..bf267bdfa8f8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1244,7 +1244,7 @@ again:
* The real subtree rescan is delayed until we have new
* CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ ret = btrfs_qgroup_add_swapped_blocks(dest,
rc->block_group, parent, slot,
path->nodes[level], path->slots[level],
last_snapshot);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3a3427428074..204c928beaf9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe)
stripe->bg->start + stripe->bg->length - stripe->logical);
}
-static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
@@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
if (err < 0) {
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ if (err != -ENODATA) {
+ /*
+				 * -ENODATA from btrfs_get_raid_extent_offset()
+				 * means there is no entry for this range in
+				 * the stripe tree. If the range is in the
+				 * extent tree it is a preallocated extent,
+				 * not an error, so only flag the other
+				 * failures here.
+ */
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ }
continue;
}
@@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
- scrub_submit_extent_sector_read(sctx, stripe);
+ scrub_submit_extent_sector_read(stripe);
return;
}
@@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes);
/*
- * For data stripe search, we cannot re-use the same extent/csum paths,
+ * For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than previous extent. Thus
* we have to use our own extent/csum paths.
*/
@@ -2103,7 +2112,6 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ ret = scrub_simple_mirror(sctx, bg, cur_logical,
BTRFS_STRIPE_LEN, device, cur_physical,
mirror_num);
if (ret)
@@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */
u64 offset;
u64 stripe_logical;
- int stop_loop = 0;
/* Extent_path should be released by now. */
ASSERT(sctx->extent_path.nodes[0] == NULL);
@@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
@@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
+ ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
@@ -2370,14 +2377,8 @@ next:
logical += increment;
physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
- if (stop_loop)
- sctx->stat.last_physical =
- map->stripes[stripe_index].physical + dev_stripe_len;
- else
- sctx->stat.last_physical = physical;
+ sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
- if (stop_loop)
- break;
}
out:
ret2 = flush_scrub_stripes(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b068469871f8..7254279c3cc9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
return ret;
}
-typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
- struct fs_path *p,
- void *ctx);
+typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_inode_ref or
@@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
u32 name_len;
char *start;
int ret = 0;
- int num = 0;
- int index;
u64 dir;
unsigned long name_off;
unsigned long elem_size;
@@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iref = (struct btrfs_inode_ref *)(ptr + cur);
name_len = btrfs_inode_ref_name_len(eb, iref);
name_off = (unsigned long)(iref + 1);
- index = btrfs_inode_ref_index(eb, iref);
dir = found_key->offset;
} else {
extref = (struct btrfs_inode_extref *)(ptr + cur);
name_len = btrfs_inode_extref_name_len(eb, extref);
name_off = (unsigned long)&extref->name;
- index = btrfs_inode_extref_index(eb, extref);
dir = btrfs_inode_extref_parent(eb, extref);
}
@@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
cur += elem_size + name_len;
- ret = iterate(num, dir, index, p, ctx);
+ ret = iterate(dir, p, ctx);
if (ret)
goto out;
- num++;
}
out:
@@ -1227,8 +1220,7 @@ out:
return ret;
}
-static int __copy_first_ref(int num, u64 dir, int index,
- struct fs_path *p, void *ctx)
+static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx)
{
int ret;
struct fs_path *pt = ctx;
@@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key di_key;
@@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
+ di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
if (!di) {
ret = 0;
@@ -4708,8 +4699,7 @@ out:
return ret;
}
-static int record_new_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -4738,8 +4728,7 @@ out:
return ret;
}
-static int record_deleted_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
- (data_offset >> PAGE_SHIFT));
+ (data_offset >> PAGE_SHIFT),
+ NULL);
if (ret)
goto out;
@@ -8135,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
* making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
- if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started.
+ */
+ if (btrfs_root_dead(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ /* Userspace tools do the checks and warn the user if it's not RO. */
+ if (!btrfs_root_readonly(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ if (send_root->dedupe_in_progress) {
dedupe_in_progress_warn(send_root);
spin_unlock(&send_root->root_item_lock);
return -EAGAIN;
@@ -8144,15 +8147,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
spin_unlock(&send_root->root_item_lock);
/*
- * Userspace tools do the checks and warn the user if it's
- * not RO.
- */
- if (!btrfs_root_readonly(send_root)) {
- ret = -EPERM;
- goto out;
- }
-
- /*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
* access_ok. Also set an upper limit for allocation size so this can't
@@ -8217,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
}
sctx->send_root = send_root;
- /*
- * Unlikely but possible, if the subvolume is marked for deletion but
- * is slow to remove the directory entry, send can still be started
- */
- if (btrfs_root_dead(sctx->send_root)) {
- ret = -EPERM;
- goto out;
- }
-
sctx->clone_roots_cnt = arg->clone_sources_count;
if (sctx->proto >= 2) {
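With the checks hoisted, both failure modes are tested under root_item_lock before any allocation, so they can return directly instead of unwinding through the "out" label. A condensed sketch of the resulting entry sequence, using only calls visible in the hunks above:

	spin_lock(&send_root->root_item_lock);
	if (btrfs_root_dead(send_root)) {		/* deletion raced with send */
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	if (!btrfs_root_readonly(send_root)) {		/* send requires a RO root */
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	if (send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}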
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index b07f4aa66878..9309886c5ea1 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
#define BTRFS_SEND_STREAM_VERSION 3
#else
#define BTRFS_SEND_STREAM_VERSION 2
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d5a9cd8a4fd8..255e85f78313 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not
- * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * immediately reusable; it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
* COMMIT_TRANS
@@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void wait_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
@@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
- wait_reserve_ticket(fs_info, space_info, ticket);
+ wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
@@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, int raid)
+static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
@@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
if (!btrfs_should_periodic_reclaim(space_info))
continue;
for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
- do_reclaim_sweep(fs_info, space_info, raid);
+ do_reclaim_sweep(space_info, raid);
}
}
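wait_reserve_ticket() no longer needs fs_info, since everything it touches hangs off the space_info and the ticket. A minimal sketch of what such a ticket wait loop typically looks like; the field names (bytes, error, wait) are assumptions based on reserve_ticket usage elsewhere in btrfs, not shown in these hunks:

	static void wait_reserve_ticket_sketch(struct btrfs_space_info *space_info,
					       struct reserve_ticket *ticket)
	{
		DEFINE_WAIT(wait);

		spin_lock(&space_info->lock);
		while (ticket->bytes > 0 && ticket->error == 0) {
			prepare_to_wait(&ticket->wait, &wait, TASK_UNINTERRUPTIBLE);
			spin_unlock(&space_info->lock);
			schedule();	/* woken when the ticket is handed space */
			spin_lock(&space_info->lock);
		}
		finish_wait(&ticket->wait, &wait);
		spin_unlock(&space_info->lock);
	}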
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index fe4d719d506b..8c68059ac1b0 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
spin_lock_init(&ret->lock);
- if (type == BTRFS_SUBPAGE_METADATA) {
+ if (type == BTRFS_SUBPAGE_METADATA)
atomic_set(&ret->eb_refs, 0);
- } else {
- atomic_set(&ret->readers, 0);
- atomic_set(&ret->writers, 0);
- }
+ else
+ atomic_set(&ret->nr_locked, 0);
return ret;
}
@@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
__start_bit; \
})
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
-
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- /*
- * Even though it's just for reading the page, no one should have
- * locked the subpage range.
- */
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- atomic_add(nbits, &subpage->readers);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
- bool is_data;
- bool last;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
- is_data = is_data_inode(BTRFS_I(folio->mapping->host));
-
- spin_lock_irqsave(&subpage->lock, flags);
-
- /* The range should have already been locked. */
- ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
- ASSERT(atomic_read(&subpage->readers) >= nbits);
-
- bitmap_clear(subpage->bitmaps, start_bit, nbits);
- last = atomic_sub_and_test(nbits, &subpage->readers);
-
- /*
- * For data we need to unlock the page if the last read has finished.
- *
- * And please don't replace @last with atomic_sub_and_test() call
- * inside if () condition.
- * As we want the atomic_sub_and_test() to be always executed.
- */
- if (is_data && last)
- folio_unlock(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
@@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
orig_start + orig_len) - *start;
}
-static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = (len >> fs_info->sectorsize_bits);
- unsigned long flags;
- int ret;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- ASSERT(atomic_read(&subpage->readers) == 0);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
- ASSERT(ret == nbits);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
@@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* extent_clear_unlock_delalloc() for compression path.
*
* This @locked_page is locked by plain lock_page(), thus its
- * subpage::writers is 0. Handle them in a special way.
+ * subpage::nr_locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0) {
+ if (atomic_read(&subpage->nr_locked) == 0) {
spin_unlock_irqrestore(&subpage->lock, flags);
return true;
}
@@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
clear_bit(bit, subpage->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
}
/*
- * Lock a folio for delalloc page writeback.
- *
- * Return -EAGAIN if the page is not properly initialized.
- * Return 0 with the page locked, and writer counter updated.
- *
- * Even with 0 returned, the page still need extra check to make sure
- * it's really the correct page, as the caller is using
- * filemap_get_folios_contig(), which can race with page invalidating.
- */
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
- folio_lock(folio);
- return 0;
- }
- folio_lock(folio);
- if (!folio_test_private(folio) || !folio_get_private(folio)) {
- folio_unlock(folio);
- return -EAGAIN;
- }
- btrfs_subpage_clamp_range(folio, &start, &len);
- btrfs_subpage_start_writer(fs_info, folio, start, len);
- return 0;
-}
-
-/*
* Handle different locked folios:
*
* - Non-subpage folio
@@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
* bitmap, reduce the writer lock number, and unlock the page if that's
* the last locked range.
*/
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
@@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
/*
* For subpage case, there are two types of locked page. With or
- * without writers number.
+ * without a count of locked sectors.
*
- * Since we own the page lock, no one else could touch subpage::writers
+ * Since we own the page lock, no one else could touch subpage::nr_locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
btrfs_subpage_clamp_range(folio, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
folio_unlock(folio);
}
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
@@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
int cleared = 0;
int bit;
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
folio_unlock(folio);
return;
}
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
@@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
if (last)
folio_unlock(folio);
@@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* This populates the involved subpage ranges so that subpage helpers can
* properly unlock them.
*/
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
unsigned long flags;
@@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
/* Target range should not yet be locked. */
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
+ ret = atomic_add_return(nbits, &subpage->nr_locked);
ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
-/*
- * Find any subpage writer locked range inside @folio, starting at file offset
- * @search_start. The caller should ensure the folio is locked.
- *
- * Return true and update @found_start_ret and @found_len_ret to the first
- * writer locked range.
- * Return false if there is no writer locked range.
- */
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const u32 sectors_per_page = fs_info->sectors_per_page;
- const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
- const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- locked, search_start, len);
- const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked;
- const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page;
- unsigned long flags;
- int first_zero;
- int first_set;
- bool found = false;
-
- ASSERT(folio_test_locked(folio));
- spin_lock_irqsave(&subpage->lock, flags);
- first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
- if (first_set >= locked_bitmap_end)
- goto out;
-
- found = true;
-
- *found_start_ret = folio_pos(folio) +
- ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
- /*
- * Since @first_set is ensured to be smaller than locked_bitmap_end
- * here, @found_start_ret should be inside the folio.
- */
- ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
-
- first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
- *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
-out:
- spin_unlock_irqrestore(&subpage->lock, flags);
- return found;
-}
-
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
const int sectors_per_page = fs_info->sectors_per_page; \
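With readers and writers folded into the single nr_locked count, data folio locking reduces to one pair of helpers. A usage sketch for a subpage filesystem (fs_info->sectorsize < PAGE_SIZE), with the surrounding writeback details elided:

	folio_lock(folio);
	/* Mark the sectors of [start, start + len) as holding the folio lock. */
	btrfs_folio_set_lock(fs_info, folio, start, len);

	/* ... submit IO for the range ... */

	/* Clears the same bits and unlocks the folio once the last range ends. */
	btrfs_folio_end_lock(fs_info, folio, start, len);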
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4b85d91d0e18..428fa9389fd4 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -45,14 +45,6 @@ enum {
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- /*
- * Both data and metadata needs to track how many readers are for the
- * page.
- * Data relies on @readers to unlock the page when last reader finished.
- * While metadata doesn't need page unlock, it needs to prevent
- * page::private get cleared before the last end_page_read().
- */
- atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -62,8 +54,12 @@ struct btrfs_subpage {
*/
atomic_t eb_refs;
- /* Structures only used by data */
- atomic_t writers;
+ /*
+ * Structures only used by data.
+ *
+ * How many sectors inside the page are locked.
+ */
+ atomic_t nr_locked;
};
unsigned long bitmaps[];
};
@@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap);
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret);
-
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap);
/*
* Template for subpage related operations.
*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c64d07134122..97a85d180b61 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,7 +28,6 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
-#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -946,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
}
static int btrfs_fill_super(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- void *data)
+ struct btrfs_fs_devices *fs_devices)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -971,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- err = open_ctree(sb, fs_devices, (char *)data);
+ err = open_ctree(sb, fs_devices);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
return err;
@@ -1893,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
- ret = btrfs_fill_super(sb, fs_devices, NULL);
+ ret = btrfs_fill_super(sb, fs_devices);
}
if (ret) {
@@ -2191,7 +2189,8 @@ static struct file_system_type btrfs_fs_type = {
.init_fs_context = btrfs_init_fs_context,
.parameters = btrfs_fs_parameters,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
+ FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("btrfs");
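The control-ioctl change below matters because PTR_ERR(NULL) evaluates to 0; splitting the IS_ERR() and NULL cases makes the "not found, not an error" outcome explicit. A generic sketch of the pattern; lookup_foo() is hypothetical:

	struct foo *p = lookup_foo(name);	/* may return ptr, ERR_PTR() or NULL */

	if (IS_ERR_OR_NULL(p)) {
		/* PTR_ERR(NULL) would silently be 0; spell the two cases out. */
		return IS_ERR(p) ? PTR_ERR(p) : 0;
	}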
@@ -2256,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- ret = PTR_ERR(device);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ ret = 0;
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2395,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
trace_btrfs_extent_map_shrinker_count(fs_info, nr);
- /*
- * Only report the real number for DEBUG builds, as there are reports of
- * serious performance degradation caused by too frequent shrinks.
- */
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
- return nr;
- return 0;
+ return nr;
}
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
@@ -2409,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- /*
- * We may be called from any task trying to allocate memory and we don't
- * want to slow it down with scanning and dropping extent maps. It would
- * also cause heavy lock contention if many tasks concurrently enter
- * here. Therefore only allow kswapd tasks to scan and drop extent maps.
- */
- if (!current_is_kswapd())
- return 0;
+ btrfs_free_extent_maps(fs_info, nr_to_scan);
- return btrfs_free_extent_maps(fs_info, nr_to_scan);
+ /* The extent map shrinker runs asynchronously, so always return 0. */
+ return 0;
}
static const struct super_operations btrfs_super_ops = {
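The shrinker hooks now split responsibilities: nr_cached_objects() reports the real extent map count, while free_cached_objects() merely queues asynchronous work and returns 0 so the VFS does not double-count freed objects. A sketch of the wiring, assuming the standard convention that ->free_cached_objects() is only invoked after ->nr_cached_objects() reported a non-zero count:

	static const struct super_operations btrfs_super_ops = {
		/* ... */
		.nr_cached_objects	= btrfs_nr_cached_objects,
		.free_cached_objects	= btrfs_free_cached_objects,
	};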
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 03926ad467c9..b843308e2bc6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum),
#endif
NULL,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ce50847e1e01..e607b5d52fb1 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -29,6 +29,7 @@ const char *test_error[] = {
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
+ [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_free_space_tree(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index dc2f2ab15fa5..b524ecf2f452 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,6 +24,7 @@ enum {
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
+ TEST_ALLOC_IO_CONTEXT,
};
extern const char *test_error[];
@@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
new file mode 100644
index 000000000000..30f17eb7b6a8
--- /dev/null
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -0,0 +1,538 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/sizes.h>
+#include "../fs.h"
+#include "../disk-io.h"
+#include "../transaction.h"
+#include "../volumes.h"
+#include "../raid-stripe-tree.h"
+#include "btrfs-tests.h"
+
+#define RST_TEST_NUM_DEVICES (2)
+#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
+
+static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
+ u64 devid)
+{
+ struct btrfs_device *dev;
+
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (dev->devid == devid)
+ return dev;
+ }
+
+ return NULL;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * delete the 1st 32K, making the new start address 1M+32K.
+ */
+static int test_front_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + SZ_32K);
+ goto out;
+ }
+
+ len = SZ_32K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+ map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical + SZ_32K, logical + SZ_32K + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical + SZ_32K) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical + SZ_32K, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (!ret) {
+ ret = -EINVAL;
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical, logical + SZ_32K);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * truncate the stripe extent down to 32K.
+ */
+static int test_tail_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical + SZ_32K, logical + SZ_64K);
+ goto out;
+ }
+
+ len = SZ_32K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * overwrite the whole range, giving it a new physical address at an offset of 1G.
+ * The intent of this test is to exercise the 'update_raid_extent_item()'
+ * function called by btrfs_insert_one_raid_extent().
+ */
+static int test_create_update_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = SZ_1G + logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("updating RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical + SZ_1G) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical + SZ_1G, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M.
+ * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M.
+ */
+static int test_simple_create_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ bioc->map_type = map_type;
+ bioc->size = SZ_64K;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+static const test_func_t tests[] = {
+ test_simple_create_delete,
+ test_create_update_delete,
+ test_tail_delete,
+ test_front_delete,
+};
+
+static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root = NULL;
+ int ret;
+
+ fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
+ root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ fs_info->stripe_root = root;
+ root->fs_info->tree_root = root;
+
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
+ if (IS_ERR(root->node)) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = PTR_ERR(root->node);
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 2 * nodesize;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_alloc_dummy_device(fs_info);
+ if (IS_ERR(dev)) {
+ test_err("cannot allocate device");
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ dev->devid = i;
+ }
+
+ btrfs_init_dummy_trans(&trans, root->fs_info);
+ ret = test(&trans);
+ if (ret)
+ goto out;
+
+out:
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+
+ return ret;
+}
+
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize)
+{
+ int ret = 0;
+
+ test_msg("running raid-stripe-tree tests");
+ for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+ ret = run_test(tests[i], sectorsize, nodesize);
+ if (ret) {
+ test_err("test-case %ps failed with %d\n", tests[i], ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
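Adding another case to this suite only needs a function with the test_func_t signature and a tests[] entry; run_test() already provides a dummy transaction whose fs_info carries the stripe root and RST_TEST_NUM_DEVICES devices. A hypothetical skeleton (name and body illustrative only):

	static int test_whole_delete(struct btrfs_trans_handle *trans)
	{
		/* Insert a RAID extent, delete the full range, expect lookup to fail. */
		return 0;
	}

	static const test_func_t tests[] = {
		test_simple_create_delete,
		test_create_update_delete,
		test_tail_delete,
		test_front_delete,
		test_whole_delete,	/* hypothetical addition */
	};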
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0fc873af891f..dc0b837efd5d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(
- &transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
@@ -349,9 +348,8 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
+ xa_init(&cur_trans->delayed_refs.head_refs);
xa_init(&cur_trans->delayed_refs.dirty_extents);
- atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
* although the tree mod log is per file system and not per transaction,
@@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_unlock(&fs_info->trans_lock);
- btrfs_cleanup_one_transaction(trans->transaction, fs_info);
+ btrfs_cleanup_one_transaction(trans->transaction);
spin_lock(&fs_info->trans_lock);
if (cur_trans == fs_info->running_transaction)
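The delayed ref heads move from a cached rbtree to an xarray, which tracks its own population; that is why the explicit num_entries counter disappears and the teardown check becomes xa_empty(). The before/after idiom, using only the calls visible above (the index is presumably the ref head's bytenr, which these hunks do not show):

	/* Before: rbtree-based tracking plus a manual entry counter. */
	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
	WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root.rb_root));

	/* After: the xarray tracks population itself. */
	xa_init(&cur_trans->delayed_refs.head_refs);
	WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));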
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index dd9ce9b9f69e..184fa5c0062a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -33,7 +33,7 @@ struct btrfs_path;
*/
#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
-/* Radix-tree tag for roots that are part of the trasaction. */
+/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
enum btrfs_trans_state {
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7b50263723bc..148d8cefa40e 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
return 0;
}
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level;
@@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != level) {
+ if (found_level != check->level) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
- eb->start, level, found_level);
+ eb->start, check->level, found_level);
return -EIO;
}
- if (!first_key)
+ if (!check->has_first_key)
return 0;
/*
@@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(first_key, &found_key);
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
if (ret) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
- eb->start, parent_transid, first_key->objectid,
- first_key->type, first_key->offset,
+ eb->start, check->transid, check->first_key.objectid,
+ check->first_key.type, check->first_key.offset,
found_key.objectid, found_key.type,
found_key.offset);
}
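Callers now pass the whole parent check instead of its pieces. A hypothetical call site; the fields shown (level, transid, has_first_key, first_key) are exactly the ones the hunk dereferences, and any other members of struct btrfs_tree_parent_check are omitted:

	struct btrfs_tree_parent_check check = {
		.level = 1,
		.transid = parent_transid,
		.has_first_key = true,
	};

	btrfs_node_key_to_cpu(parent, &check.first_key, slot);
	ret = btrfs_verify_level_key(eb, &check);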
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 01669cfa6578..db67f96cbe4b 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid);
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9637c7cdc0cf..c8d6587688b3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
const struct list_head *delayed_del_list,
const struct btrfs_delayed_item *first,
const struct btrfs_delayed_item **last_ret)
@@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
if (ret < 0) {
return ret;
} else if (ret == 0) {
- ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ ret = batch_delete_dir_index_items(trans, inode, path,
delayed_del_list, curr,
&last);
if (ret)
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index b382a4c443d4..1ac2678fc4ca 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
* is freed (its refcount is decremented).
*/
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq)
{
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 6308c577a4a4..1c12566040db 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb51b609190f..1cccaf9c2b0d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -733,6 +733,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
}
/*
+ * We can have very weird soft links passed in, e.g. "/proc/self/fd/<fd>",
+ * which can be a soft link to a block device.
+ *
+ * But it's never a good idea to record those weird names.
+ * Here we check whether the path (without following symlinks) is a
+ * proper one inside "/dev/".
+ */
+static bool is_good_dev_path(const char *dev_path)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL };
+ char *path_buf = NULL;
+ char *resolved_path;
+ bool is_good = false;
+ int ret;
+
+ if (!dev_path)
+ goto out;
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf)
+ goto out;
+
+ /*
+ * Do not follow soft link, just check if the original path is inside
+ * "/dev/".
+ */
+ ret = kern_path(dev_path, 0, &path);
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ if (IS_ERR(resolved_path))
+ goto out;
+ if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
+ goto out;
+ is_good = true;
+out:
+ kfree(path_buf);
+ path_put(&path);
+ return is_good;
+}
+
+static int get_canonical_dev_path(const char *dev_path, char *canonical)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL };
+ char *path_buf = NULL;
+ char *resolved_path;
+ int ret;
+
+ if (!dev_path) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ ret = strscpy(canonical, resolved_path, PATH_MAX);
+out:
+ kfree(path_buf);
+ path_put(&path);
+ return ret;
+}
+
+static bool is_same_device(struct btrfs_device *device, const char *new_path)
+{
+ struct path old = { .mnt = NULL, .dentry = NULL };
+ struct path new = { .mnt = NULL, .dentry = NULL };
+ char *old_path = NULL;
+ bool is_same = false;
+ int ret;
+
+ if (!device->name)
+ goto out;
+
+ old_path = kzalloc(PATH_MAX, GFP_NOFS);
+ if (!old_path)
+ goto out;
+
+ rcu_read_lock();
+ ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out;
+
+ ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
+ if (ret)
+ goto out;
+ ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
+ if (ret)
+ goto out;
+ if (path_equal(&old, &new))
+ is_same = true;
+out:
+ kfree(old_path);
+ path_put(&old);
+ path_put(&new);
+ return is_same;
+}
+
+/*
* Add new device to list of registered devices
*
* Returns:
@@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
- } else if (!device->name || strcmp(device->name->str, path)) {
+ } else if (!device->name || !is_same_device(device, path)) {
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1383,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
+ char *canonical_path = NULL;
u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
+ if (!is_good_dev_path(path)) {
+ canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (canonical_path) {
+ ret = get_canonical_dev_path(path, canonical_path);
+ if (ret < 0) {
+ kfree(canonical_path);
+ canonical_path = NULL;
+ }
+ }
+ }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1433,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(path, disk_super, &new_device_added);
+ device = device_list_add(canonical_path ? : path, disk_super,
+ &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1442,6 +1562,7 @@ free_disk_super:
error_bdev_put:
fput(bdev_file);
+ kfree(canonical_path);
return device;
}
@@ -2721,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- btrfs_clear_sb_rdonly(sb);
-
/* GFP_KERNEL allocation must not be under device_list_mutex */
seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices)) {
@@ -2865,8 +2984,6 @@ error_sysfs:
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
- if (seeding_dev)
- btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
error_free_zone:
@@ -5310,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity,
@@ -5842,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
-{
- struct btrfs_chunk_map *map;
- int ret = 0;
-
- if (!btrfs_fs_incompat(fs_info, RAID56))
- return 0;
-
- map = btrfs_get_chunk_map(fs_info, logical, len);
-
- if (!WARN_ON(IS_ERR(map))) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- ret = 1;
- btrfs_free_chunk_map(map);
- }
- return ret;
-}
-
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@@ -5920,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- u64 logical,
- u16 total_stripes)
+EXPORT_FOR_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -6481,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
- down_read(&dev_replace->rwsem);
+ if (dev_replace->replace_task != current)
+ down_read(&dev_replace->rwsem);
+
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/*
* Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait.
*/
- if (!dev_replace_is_ongoing)
+ if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
up_read(&dev_replace->rwsem);
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@@ -6627,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->mirror_num = io_geom.mirror_num;
out:
- if (dev_replace_is_ongoing) {
+ if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
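Taken together, the path helpers added above keep the stored device name canonical: scans through odd symlinks such as "/proc/self/fd/<fd>" are resolved before being recorded, and is_same_device() compares resolved paths rather than raw strings. Condensed from btrfs_scan_one_device() above, with buffer management and error paths elided:

	char *canonical = NULL;

	if (!is_good_dev_path(path)) {		/* not a plain /dev/ name */
		canonical = kmalloc(PATH_MAX, GFP_KERNEL);
		if (canonical && get_canonical_dev_path(path, canonical) < 0) {
			kfree(canonical);
			canonical = NULL;	/* fall back to the raw path */
		}
	}
	device = device_list_add(canonical ?: path, disk_super, &new_device_added);
	kfree(canonical);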
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4481575dd70f..3a416b1bc24c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -306,7 +306,7 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
* Checksum mode - offload it to workqueues or do it synchronously in
* btrfs_submit_chunk().
@@ -430,7 +430,7 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
- u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
@@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes);
+#endif
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ce464cd8e0ac..bc18710d1dcf 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
size_t name_len = strlen(name);
int ret = 0;
@@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/
ret = 0;
btrfs_assert_tree_write_locked(path->nodes[0]);
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
goto out;
}
} else if (ret == -EEXIST) {
ret = 0;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
ASSERT(di); /* logic error */
} else if (ret) {
goto out;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 100abc00b794..ddf0d5a448a7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
data_in = kmap_local_folio(in_folio, pg_off);
- start += PAGE_SIZE;
+ start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
}
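The one-line fix matters because btrfs_calc_input_length() can return less than PAGE_SIZE (e.g. near the end of the input range), and start must advance by exactly the number of bytes handed to the stream. A worked example with illustrative numbers:

	/* Input range ends 2K into the third page. */
	u64 orig_end = 10240;
	u64 start = 8192;
	u32 cur_len = 2048;	/* what btrfs_calc_input_length() returns here */

	start += cur_len;	/* 10240: stays in sync with strm.avail_in */
	/*
	 * The old start += PAGE_SIZE would yield 12288, desynchronizing the
	 * offset from the bytes actually consumed by the stream.
	 */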
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 69d03feea4e0..11ed523e528e 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -707,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* zoned mode. In this case, we don't have a valid max zone
* append size.
*/
- if (bdev_is_zoned(device->bdev)) {
- blk_stack_limits(lim,
- &bdev_get_queue(device->bdev)->limits,
- 0);
- }
+ if (bdev_is_zoned(device->bdev))
+ blk_stack_limits(lim, bdev_limits(device->bdev), 0);
+ }
+
+ ret = blk_validate_limits(lim);
+ if (ret) {
+ btrfs_err(fs_info, "zoned: failed to validate queue limits");
+ return ret;
}
/*
@@ -1739,7 +1742,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
return false;
/*
- * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has.
* Furthermore we have set aside own block-group from which only the
* relocation "process" can allocate and make sure only one process at a
@@ -1973,7 +1976,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group->meta_write_pointer > eb->start)
return -EBUSY;
- /* If for_sync, this hole will be filled with trasnsaction commit. */
+ /* If for_sync, this hole will be filled by a transaction commit. */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
return -EAGAIN;
return -EBUSY;
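After stacking each zoned device's queue limits, the aggregate is now validated once before use. Condensed from the hunk above, assuming the enclosing loop walks the filesystem's device list:

	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (bdev_is_zoned(device->bdev))
			blk_stack_limits(lim, bdev_limits(device->bdev), 0);
	}

	ret = blk_validate_limits(lim);
	if (ret) {
		btrfs_err(fs_info, "zoned: failed to validate queue limits");
		return ret;
	}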
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 866607fd3e58..5232b56d5892 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
+ ASSERT(timer == &wsm.timer);
+
spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
@@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
- tot_in += PAGE_SIZE;
+ tot_in += workspace->in_buf.size;
kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL;
folio_put(in_folio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..b158cb7a5038 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -855,8 +855,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* done a sync(). Just drop the buffers from the inode list.
*
* NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
- * assumes that all the buffers are against the blockdev. Not true
- * for reiserfs.
+ * assumes that all the buffers are against the blockdev.
*/
void invalidate_inode_buffers(struct inode *inode)
{
@@ -1649,6 +1648,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
if (length == folio_size(folio))
filemap_release_folio(folio, 0);
out:
+ folio_clear_mappedtodisk(folio);
return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
- __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
@@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (wbc) {
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+ wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
}
submit_bio(bio);
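The bh submission path now speaks folios: a buffer_head never crosses a folio boundary, so the single-segment add cannot fail on a freshly allocated bio, which is what makes the _nofail variant safe here. Condensed from the hunks above:

	bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
	if (wbc) {
		wbc_init_bio(wbc, bio);
		/* cgroup writeback accounting also takes the folio now */
		wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
	}
	submit_bio(bio);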
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 35ba2117a6f6..3e63cfe15874 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object,
static void cachefiles_clean_up_object(struct cachefiles_object *object,
struct cachefiles_cache *cache)
{
+ struct file *file;
+
if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) {
if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
cachefiles_see_object(object, cachefiles_obj_see_clean_delete);
@@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object,
}
cachefiles_unmark_inode_in_use(object, object->file);
- if (object->file) {
- fput(object->file);
- object->file = NULL;
- }
+
+ spin_lock(&object->lock);
+ file = object->file;
+ object->file = NULL;
+ spin_unlock(&object->lock);
+
+ if (file)
+ fput(file);
}
/*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 2b3f9935dbb4..7cf59713f0f7 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
}
if (!d_is_negative(dentry)) {
- if (d_backing_inode(dentry) == file_inode(object->file)) {
- success = true;
- goto out_dput;
- }
-
ret = cachefiles_unlink(volume->cache, object, fan, dentry,
FSCACHE_OBJECT_IS_STALE);
if (ret < 0)
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 470c96658385..fe3de9ad57bf 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
{
struct cachefiles_object *object = kiocb->ki_filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = object->file;
- size_t len = iter->count;
+ struct file *file;
+ size_t len = iter->count, aligned_len = len;
loff_t pos = kiocb->ki_pos;
const struct cred *saved_cred;
int ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
- return ret;
+ goto out;
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret)
+ if (!ret) {
ret = len;
+ kiocb->ki_pos += ret;
+ }
+out:
+ fput(file);
return ret;
}
@@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
int whence)
{
struct cachefiles_object *object = filp->private_data;
- struct file *file = object->file;
+ struct file *file;
+ loff_t ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
- return vfs_llseek(file, pos, whence);
+ ret = vfs_llseek(file, pos, whence);
+ fput(file);
+
+ return ret;
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c2a9e2cc03de..4c82348fe1e6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1054,7 +1054,9 @@ get_more_pages:
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+
+ page = &folio->page;
doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
@@ -1081,8 +1083,6 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- struct folio *folio = page_folio(page);
-
doutc(cl, "folio at %lu beyond eof %llu\n",
folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
@@ -1098,16 +1098,16 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page) ||
- PagePrivate2(page) /* [DEPRECATED] */) {
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", page);
- unlock_page(page);
+ doutc(cl, "%p under writeback\n", folio);
+ folio_unlock(folio);
continue;
}
- doutc(cl, "waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
}
if (!clear_page_dirty_for_io(page)) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 57cc096c498a..c2ddb998f3c9 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
/**
* cdev_device_del() - inverse of cdev_device_add
- * @dev: the device structure
* @cdev: the cdev structure
+ * @dev: the device structure
*
* cdev_device_del() is a helper function to call cdev_del and device_del.
* It should be used whenever cdev_device_add is used.
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 8f0af4f62631..d5ef5469e4e6 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -80,6 +80,16 @@
#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
#endif
+#ifdef COMPAT_ELF_HWCAP3
+#undef ELF_HWCAP3
+#define ELF_HWCAP3 COMPAT_ELF_HWCAP3
+#endif
+
+#ifdef COMPAT_ELF_HWCAP4
+#undef ELF_HWCAP4
+#define ELF_HWCAP4 COMPAT_ELF_HWCAP4
+#endif
+
#ifdef COMPAT_ARCH_DLINFO
#undef ARCH_DLINFO
#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index e710a1782382..0b969d0eb8ff 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -55,6 +55,8 @@ struct configfs_dirent {
#define CONFIGFS_USET_IN_MKDIR 0x0200
#define CONFIGFS_USET_CREATING 0x0400
#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
+#define CONFIGFS_PINNED \
+ (CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK)
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
@@ -73,8 +75,6 @@ extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *,
void *, umode_t, int, struct configfs_fragment *);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
-extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
-
extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
extern int configfs_setattr(struct mnt_idmap *idmap,
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 43d6bde1adcc..7d10278db30d 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -207,7 +207,17 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren
return ERR_PTR(-ENOENT);
}
sd->s_frag = get_fragment(frag);
- list_add(&sd->s_sibling, &parent_sd->s_children);
+
+ /*
+ * configfs_lookup scans only for unpinned items. s_children is
+ * partitioned so that configfs_lookup can bail out early.
+	 * CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are not symmetrical;
+	 * readdir cursors still need to be inserted at the front of the list.
+ */
+ if (sd->s_type & CONFIGFS_PINNED)
+ list_add_tail(&sd->s_sibling, &parent_sd->s_children);
+ else
+ list_add(&sd->s_sibling, &parent_sd->s_children);
spin_unlock(&configfs_dirent_lock);
return sd;
@@ -220,10 +230,11 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *paren
*
* called with parent inode's i_mutex held
*/
-static int configfs_dirent_exists(struct configfs_dirent *parent_sd,
- const unsigned char *new)
+static int configfs_dirent_exists(struct dentry *dentry)
{
- struct configfs_dirent * sd;
+ struct configfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+ const unsigned char *new = dentry->d_name.name;
+ struct configfs_dirent *sd;
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (sd->s_element) {
@@ -289,10 +300,6 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
BUG_ON(!item);
- error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name);
- if (unlikely(error))
- return error;
-
error = configfs_make_dirent(p->d_fsdata, dentry, item, mode,
CONFIGFS_DIR | CONFIGFS_USET_CREATING,
frag);
@@ -451,6 +458,18 @@ static struct dentry * configfs_lookup(struct inode *dir,
spin_lock(&configfs_dirent_lock);
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+
+ /*
+ * s_children is partitioned, see configfs_new_dirent. The first
+ * pinned item indicates we can stop scanning.
+ */
+ if (sd->s_type & CONFIGFS_PINNED)
+ break;
+
+ /*
+		 * Note: CONFIGFS_PINNED and CONFIGFS_NOT_PINNED are asymmetric;
+		 * there may be a readdir cursor in this list.
+ */
if ((sd->s_type & CONFIGFS_NOT_PINNED) &&
!strcmp(configfs_get_name(sd), dentry->d_name.name)) {
struct configfs_attribute *attr = sd->s_element;
@@ -1885,8 +1904,11 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
if (dentry) {
d_add(dentry, NULL);
- err = configfs_attach_group(sd->s_element, &group->cg_item,
- dentry, frag);
+ err = configfs_dirent_exists(dentry);
+ if (!err)
+ err = configfs_attach_group(sd->s_element,
+ &group->cg_item,
+ dentry, frag);
if (err) {
BUG_ON(d_inode(dentry));
d_drop(dentry);
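Taken together, the dir.c hunks maintain a single invariant: s_children keeps unpinned entries (attributes and readdir cursors) at the front and pinned entries (directories, links, the root) at the back, so a name lookup can stop at the first pinned entry. A sketch of the two halves of that invariant, condensed from the hunks above:

	/* Insertion keeps the list partitioned: unpinned first, pinned last. */
	static void insert_dirent(struct configfs_dirent *sd,
				  struct configfs_dirent *parent_sd)
	{
		if (sd->s_type & CONFIGFS_PINNED)
			list_add_tail(&sd->s_sibling, &parent_sd->s_children);
		else
			list_add(&sd->s_sibling, &parent_sd->s_children);
	}

	/* Lookup scans only the unpinned prefix of the list. */
	static struct configfs_dirent *
	find_unpinned(struct configfs_dirent *parent_sd, const char *name)
	{
		struct configfs_dirent *sd;

		list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
			if (sd->s_type & CONFIGFS_PINNED)
				break;	/* partition boundary: stop scanning */
			if (sd->s_element &&
			    !strcmp(configfs_get_name(sd), name))
				return sd;
		}
		return NULL;
	}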
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index dcc22f593e43..1d2e3a5738d1 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -216,28 +216,3 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
spin_unlock(&dentry->d_lock);
}
}
-
-void configfs_hash_and_remove(struct dentry * dir, const char * name)
-{
- struct configfs_dirent * sd;
- struct configfs_dirent * parent_sd = dir->d_fsdata;
-
- if (d_really_is_negative(dir))
- /* no inode means this hasn't been made visible yet */
- return;
-
- inode_lock(d_inode(dir));
- list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (!sd->s_element)
- continue;
- if (!strcmp(configfs_get_name(sd), name)) {
- spin_lock(&configfs_dirent_lock);
- list_del_init(&sd->s_sibling);
- spin_unlock(&configfs_dirent_lock);
- configfs_drop_dentry(sd, dir);
- configfs_put(sd);
- break;
- }
- }
- inode_unlock(d_inode(dir));
-}
diff --git a/fs/coredump.c b/fs/coredump.c
index 45737b43dda5..d48edb37bc35 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
} else {
dump_skip(cprm, PAGE_SIZE);
}
+ cond_resched();
}
dump_page_free(dump_page);
return 1;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 206835e31efa..787e9c8938ba 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -22,6 +22,7 @@
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
+#include <linux/once.h>
#include <linux/seq_file.h>
#include "fscrypt_private.h"
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f6b16ba30d0..0099077a2982 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -135,6 +135,7 @@ struct dentry_stat_t {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
+static int dentry_negative_policy;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
@@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
+ {
+ .procname = "dentry-negative",
+ .data = &dentry_negative_policy,
+ .maxlen = sizeof(dentry_negative_policy),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
- * @inode: the inode case-insensitive lookup has found
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @inode: the inode case-insensitive lookup has found
* @name: the case-exact name to be associated with the returned dentry
*
* This is to avoid filling the dcache with case-insensitive names to the
@@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci);
/**
* d_same_name - compare dentry name with case-exact name
- * @parent: parent dentry
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @parent: parent dentry
* @name: the case-exact name to be associated with the returned dentry
*
* Return: true if names are same, or false
@@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry)
* Are we the only user?
*/
if (dentry->d_lockref.count == 1) {
+ if (dentry_negative_policy)
+ __d_drop(dentry);
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
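The new knob is a plain 0/1 sysctl: with fs.dentry-negative set to 1, d_delete() unhashes the dentry instead of retaining it as a negative entry. A hedged userspace sketch of flipping the policy (path taken from the sysctl table above; error handling kept minimal):

	/* Sketch: toggle the dentry-negative policy from userspace. */
	#include <fcntl.h>
	#include <unistd.h>

	int set_dentry_negative(int on)
	{
		int fd = open("/proc/sys/fs/dentry-negative", O_WRONLY);

		if (fd < 0)
			return -1;
		(void)write(fd, on ? "1" : "0", 1);
		close(fd);
		return 0;
	}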
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 67299e8b734e..47dc96dfe386 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -100,8 +100,16 @@ int debugfs_file_get(struct dentry *dentry)
if (!fsd)
return -ENOMEM;
- fsd->real_fops = (void *)((unsigned long)d_fsd &
- ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ if ((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) {
+ fsd->real_fops = NULL;
+ fsd->short_fops = (void *)((unsigned long)d_fsd &
+ ~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+ } else {
+ fsd->real_fops = (void *)((unsigned long)d_fsd &
+ ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
+ fsd->short_fops = NULL;
+ }
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
INIT_LIST_HEAD(&fsd->cancellations);
@@ -241,9 +249,10 @@ static int debugfs_locked_down(struct inode *inode,
{
if ((inode->i_mode & 07777 & ~0444) == 0 &&
!(filp->f_mode & FMODE_WRITE) &&
- !real_fops->unlocked_ioctl &&
- !real_fops->compat_ioctl &&
- !real_fops->mmap)
+ (!real_fops ||
+ (!real_fops->unlocked_ioctl &&
+ !real_fops->compat_ioctl &&
+ !real_fops->mmap)))
return 0;
if (security_locked_down(LOCKDOWN_DEBUGFS))
@@ -316,19 +325,38 @@ static ret_type full_proxy_ ## name(proto) \
return r; \
}
-FULL_PROXY_FUNC(llseek, loff_t, filp,
- PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence));
+#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \
+static ret_type full_proxy_ ## name(proto) \
+{ \
+ struct dentry *dentry = F_DENTRY(filp); \
+ struct debugfs_fsdata *fsd; \
+ ret_type r; \
+ \
+ r = debugfs_file_get(dentry); \
+ if (unlikely(r)) \
+ return r; \
+ fsd = dentry->d_fsdata; \
+ if (fsd->real_fops) \
+ r = fsd->real_fops->name(args); \
+ else \
+ r = fsd->short_fops->name(args); \
+ debugfs_file_put(dentry); \
+ return r; \
+}
+
+FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence));
-FULL_PROXY_FUNC(read, ssize_t, filp,
- PROTO(struct file *filp, char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
-FULL_PROXY_FUNC(write, ssize_t, filp,
- PROTO(struct file *filp, const char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos));
+FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos));
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -363,7 +391,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* not to leak any resources. Releasers must not assume that
* ->i_private is still being meaningful here.
*/
- if (real_fops->release)
+ if (real_fops && real_fops->release)
r = real_fops->release(inode, filp);
replace_fops(filp, d_inode(dentry)->i_fop);
@@ -373,39 +401,48 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
}
static void __full_proxy_fops_init(struct file_operations *proxy_fops,
- const struct file_operations *real_fops)
+ struct debugfs_fsdata *fsd)
{
proxy_fops->release = full_proxy_release;
- if (real_fops->llseek)
+
+ if ((fsd->real_fops && fsd->real_fops->llseek) ||
+ (fsd->short_fops && fsd->short_fops->llseek))
proxy_fops->llseek = full_proxy_llseek;
- if (real_fops->read)
+
+ if ((fsd->real_fops && fsd->real_fops->read) ||
+ (fsd->short_fops && fsd->short_fops->read))
proxy_fops->read = full_proxy_read;
- if (real_fops->write)
+
+ if ((fsd->real_fops && fsd->real_fops->write) ||
+ (fsd->short_fops && fsd->short_fops->write))
proxy_fops->write = full_proxy_write;
- if (real_fops->poll)
+
+ if (fsd->real_fops && fsd->real_fops->poll)
proxy_fops->poll = full_proxy_poll;
- if (real_fops->unlocked_ioctl)
+
+ if (fsd->real_fops && fsd->real_fops->unlocked_ioctl)
proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl;
}
static int full_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = NULL;
+ const struct file_operations *real_fops;
struct file_operations *proxy_fops = NULL;
+ struct debugfs_fsdata *fsd;
int r;
r = debugfs_file_get(dentry);
if (r)
return r == -EIO ? -ENOENT : r;
- real_fops = debugfs_real_fops(filp);
-
+ fsd = dentry->d_fsdata;
+ real_fops = fsd->real_fops;
r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
- if (!fops_get(real_fops)) {
+ if (real_fops && !fops_get(real_fops)) {
#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING) {
@@ -426,11 +463,14 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
r = -ENOMEM;
goto free_proxy;
}
- __full_proxy_fops_init(proxy_fops, real_fops);
+ __full_proxy_fops_init(proxy_fops, fsd);
replace_fops(filp, proxy_fops);
- if (real_fops->open) {
- r = real_fops->open(inode, filp);
+ if (!real_fops || real_fops->open) {
+ if (real_fops)
+ r = real_fops->open(inode, filp);
+ else
+ r = simple_open(inode, filp);
if (r) {
replace_fops(filp, d_inode(dentry)->i_fop);
goto free_proxy;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 66d9b3b4c588..38a9c7eb97e6 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -412,7 +412,7 @@ static struct dentry *end_creating(struct dentry *dentry)
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *proxy_fops,
- const struct file_operations *real_fops)
+ const void *real_fops)
{
struct dentry *dentry;
struct inode *inode;
@@ -450,49 +450,38 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
return end_creating(dentry);
}
-/**
- * debugfs_create_file - create a file in the debugfs filesystem
- * @name: a pointer to a string containing the name of the file to create.
- * @mode: the permission that the file should have.
- * @parent: a pointer to the parent dentry for this file. This should be a
- * directory dentry if set. If this parameter is NULL, then the
- * file will be created in the root of the debugfs filesystem.
- * @data: a pointer to something that the caller will want to get to later
- * on. The inode.i_private pointer will point to this value on
- * the open() call.
- * @fops: a pointer to a struct file_operations that should be used for
- * this file.
- *
- * This is the basic "create a file" function for debugfs. It allows for a
- * wide range of flexibility in creating a file, or a directory (if you want
- * to create a directory, the debugfs_create_dir() function is
- * recommended to be used instead.)
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value -%ENODEV will be
- * returned.
- *
- * NOTE: it's expected that most callers should _ignore_ the errors returned
- * by this function. Other debugfs functions handle the fact that the "dentry"
- * passed to them could be an error and they don't crash in that case.
- * Drivers should generally work fine even if debugfs fails to init anyway.
- */
-struct dentry *debugfs_create_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops)
+struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
{
+ if (WARN_ON((unsigned long)fops &
+ (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return ERR_PTR(-EINVAL);
return __debugfs_create_file(name, mode, parent, data,
fops ? &debugfs_full_proxy_file_operations :
&debugfs_noop_file_operations,
fops);
}
-EXPORT_SYMBOL_GPL(debugfs_create_file);
+EXPORT_SYMBOL_GPL(debugfs_create_file_full);
+
+struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct debugfs_short_fops *fops)
+{
+ if (WARN_ON((unsigned long)fops &
+ (DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT |
+ DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)))
+ return ERR_PTR(-EINVAL);
+
+ return __debugfs_create_file(name, mode, parent, data,
+ fops ? &debugfs_full_proxy_file_operations :
+ &debugfs_noop_file_operations,
+ (const void *)((unsigned long)fops |
+ DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_short);
/**
* debugfs_create_file_unsafe - create a file in the debugfs filesystem
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index dae80c2a469e..a3edfa4f0d8e 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -18,6 +18,7 @@ extern const struct file_operations debugfs_full_proxy_file_operations;
struct debugfs_fsdata {
const struct file_operations *real_fops;
+ const struct debugfs_short_fops *short_fops;
union {
/* automount_fn is used when real_fops is NULL */
debugfs_automount_t automount;
@@ -39,6 +40,11 @@ struct debugfs_fsdata {
* pointer gets its lowest bit set.
*/
#define DEBUGFS_FSDATA_IS_REAL_FOPS_BIT BIT(0)
+/*
+ * When this bit is set, the fops pointer stored in a dentry's
+ * ->d_fsdata is a debugfs_short_fops rather than a full
+ * file_operations.
+ */
+#define DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT BIT(1)
/* Access BITS */
#define DEBUGFS_ALLOW_API BIT(0)
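Both flag bits work because file_operations and debugfs_short_fops pointers are at least 4-byte aligned, leaving bits 0-1 of ->d_fsdata free as a type tag. A condensed sketch of the decode that debugfs_file_get() now performs, mirroring the file.c hunk:

	/* Sketch of the d_fsdata tag decode; assumes 4-byte pointer alignment. */
	static void decode_fsdata(void *d_fsd, struct debugfs_fsdata *fsd)
	{
		unsigned long v = (unsigned long)d_fsd;

		if (v & DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT) {
			fsd->real_fops = NULL;
			fsd->short_fops = (void *)(v &
					~(DEBUGFS_FSDATA_IS_REAL_FOPS_BIT |
					  DEBUGFS_FSDATA_IS_SHORT_FOPS_BIT));
		} else {
			fsd->real_fops = (void *)(v &
					~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
			fsd->short_fops = NULL;
		}
	}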
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 742b30b61c19..0fe8d80ce5e8 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -30,7 +30,7 @@ static void dlm_run_callback(uint32_t ls_id, uint32_t lkb_id, int8_t mode,
trace_dlm_bast(ls_id, lkb_id, mode, res_name, res_length);
bastfn(astparam, mode);
} else if (flags & DLM_CB_CAST) {
- trace_dlm_ast(ls_id, lkb_id, sb_status, sb_flags, res_name,
+ trace_dlm_ast(ls_id, lkb_id, sb_flags, sb_status, res_name,
res_length);
lksb->sb_status = sb_status;
lksb->sb_flags = sb_flags;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac96f1c1d74..b2f21aa00719 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -24,9 +24,9 @@
#include "lowcomms.h"
/*
- * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid (refers to <node>)
* /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
- * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/nodeid (refers to <comm>)
* /config/dlm/<cluster>/comms/<comm>/local
* /config/dlm/<cluster>/comms/<comm>/addr (write only)
* /config/dlm/<cluster>/comms/<comm>/addr_list (read only)
@@ -73,20 +73,6 @@ const struct rhashtable_params dlm_rhash_rsb_params = {
struct dlm_cluster {
struct config_group group;
- unsigned int cl_tcp_port;
- unsigned int cl_buffer_size;
- unsigned int cl_rsbtbl_size;
- unsigned int cl_recover_timer;
- unsigned int cl_toss_secs;
- unsigned int cl_scan_secs;
- unsigned int cl_log_debug;
- unsigned int cl_log_info;
- unsigned int cl_protocol;
- unsigned int cl_mark;
- unsigned int cl_new_rsb_count;
- unsigned int cl_recover_callbacks;
- char cl_cluster_name[DLM_LOCKSPACE_LEN];
-
struct dlm_spaces *sps;
struct dlm_comms *cms;
};
@@ -115,25 +101,60 @@ enum {
static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
{
- struct dlm_cluster *cl = config_item_to_cluster(item);
- return sprintf(buf, "%s\n", cl->cl_cluster_name);
+ return sprintf(buf, "%s\n", dlm_config.ci_cluster_name);
}
static ssize_t cluster_cluster_name_store(struct config_item *item,
const char *buf, size_t len)
{
- struct dlm_cluster *cl = config_item_to_cluster(item);
-
strscpy(dlm_config.ci_cluster_name, buf,
- sizeof(dlm_config.ci_cluster_name));
- strscpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
+ sizeof(dlm_config.ci_cluster_name));
return len;
}
CONFIGFS_ATTR(cluster_, cluster_name);
-static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, int (*check_cb)(unsigned int x),
+static ssize_t cluster_tcp_port_show(struct config_item *item, char *buf)
+{
+ return sprintf(buf, "%u\n", be16_to_cpu(dlm_config.ci_tcp_port));
+}
+
+static int dlm_check_zero_and_dlm_running(unsigned int x)
+{
+ if (!x)
+ return -EINVAL;
+
+ if (dlm_lowcomms_is_running())
+ return -EBUSY;
+
+ return 0;
+}
+
+static ssize_t cluster_tcp_port_store(struct config_item *item,
+ const char *buf, size_t len)
+{
+ int rc;
+ u16 x;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ rc = kstrtou16(buf, 0, &x);
+ if (rc)
+ return rc;
+
+ rc = dlm_check_zero_and_dlm_running(x);
+ if (rc)
+ return rc;
+
+ dlm_config.ci_tcp_port = cpu_to_be16(x);
+ return len;
+}
+
+CONFIGFS_ATTR(cluster_, tcp_port);
+
+static ssize_t cluster_set(unsigned int *info_field,
+ int (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -151,7 +172,6 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
return rc;
}
- *cl_field = x;
*info_field = x;
return len;
@@ -161,14 +181,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
static ssize_t cluster_##name##_store(struct config_item *item, \
const char *buf, size_t len) \
{ \
- struct dlm_cluster *cl = config_item_to_cluster(item); \
- return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
- check_cb, buf, len); \
+ return cluster_set(&dlm_config.ci_##name, check_cb, buf, len); \
} \
static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
- struct dlm_cluster *cl = config_item_to_cluster(item); \
- return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
+ return snprintf(buf, PAGE_SIZE, "%u\n", dlm_config.ci_##name); \
} \
CONFIGFS_ATTR(cluster_, name);
@@ -191,17 +208,6 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
return 0;
}
-static int dlm_check_zero_and_dlm_running(unsigned int x)
-{
- if (!x)
- return -EINVAL;
-
- if (dlm_lowcomms_is_running())
- return -EBUSY;
-
- return 0;
-}
-
static int dlm_check_zero(unsigned int x)
{
if (!x)
@@ -218,7 +224,6 @@ static int dlm_check_buffer_size(unsigned int x)
return 0;
}
-CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running);
CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
CLUSTER_ATTR(recover_timer, dlm_check_zero);
@@ -423,20 +428,6 @@ static struct config_group *make_cluster(struct config_group *g,
configfs_add_default_group(&sps->ss_group, &cl->group);
configfs_add_default_group(&cms->cs_group, &cl->group);
- cl->cl_tcp_port = dlm_config.ci_tcp_port;
- cl->cl_buffer_size = dlm_config.ci_buffer_size;
- cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
- cl->cl_recover_timer = dlm_config.ci_recover_timer;
- cl->cl_toss_secs = dlm_config.ci_toss_secs;
- cl->cl_scan_secs = dlm_config.ci_scan_secs;
- cl->cl_log_debug = dlm_config.ci_log_debug;
- cl->cl_log_info = dlm_config.ci_log_info;
- cl->cl_protocol = dlm_config.ci_protocol;
- cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
- cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
- memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
- DLM_LOCKSPACE_LEN);
-
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
return &cl->group;
@@ -517,6 +508,12 @@ static void release_space(struct config_item *i)
static struct config_item *make_comm(struct config_group *g, const char *name)
{
struct dlm_comm *cm;
+ unsigned int nodeid;
+ int rv;
+
+ rv = kstrtouint(name, 0, &nodeid);
+ if (rv)
+ return ERR_PTR(rv);
cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
if (!cm)
@@ -528,7 +525,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
if (!cm->seq)
cm->seq = dlm_comm_count++;
- cm->nodeid = -1;
+ cm->nodeid = nodeid;
cm->local = 0;
cm->addr_count = 0;
cm->mark = 0;
@@ -555,16 +552,25 @@ static void release_comm(struct config_item *i)
static struct config_item *make_node(struct config_group *g, const char *name)
{
struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+ unsigned int nodeid;
struct dlm_node *nd;
+ uint32_t seq = 0;
+ int rv;
+
+ rv = kstrtouint(name, 0, &nodeid);
+ if (rv)
+ return ERR_PTR(rv);
nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
if (!nd)
return ERR_PTR(-ENOMEM);
config_item_init_type_name(&nd->item, name, &node_type);
- nd->nodeid = -1;
+ nd->nodeid = nodeid;
nd->weight = 1; /* default weight of 1 if none is set */
nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */
+ dlm_comm_seq(nodeid, &seq, true);
+ nd->comm_seq = seq;
mutex_lock(&sp->members_lock);
list_add(&nd->list, &sp->members);
@@ -622,16 +628,19 @@ void dlm_config_exit(void)
static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid);
+ unsigned int nodeid;
+ int rv;
+
+ rv = kstrtouint(config_item_name(item), 0, &nodeid);
+ if (WARN_ON(rv))
+ return rv;
+
+ return sprintf(buf, "%u\n", nodeid);
}
static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid);
-
- if (rc)
- return rc;
return len;
}
@@ -772,20 +781,19 @@ static struct configfs_attribute *comm_attrs[] = {
static ssize_t node_nodeid_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid);
+ unsigned int nodeid;
+ int rv;
+
+ rv = kstrtouint(config_item_name(item), 0, &nodeid);
+ if (WARN_ON(rv))
+ return rv;
+
+ return sprintf(buf, "%u\n", nodeid);
}
static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
- struct dlm_node *nd = config_item_to_node(item);
- uint32_t seq = 0;
- int rc = kstrtoint(buf, 0, &nd->nodeid);
-
- if (rc)
- return rc;
- dlm_comm_seq(nd->nodeid, &seq);
- nd->comm_seq = seq;
return len;
}
@@ -845,7 +853,7 @@ static struct dlm_comm *get_comm(int nodeid)
if (!comm_list)
return NULL;
- mutex_lock(&clusters_root.subsys.su_mutex);
+ WARN_ON_ONCE(!mutex_is_locked(&clusters_root.subsys.su_mutex));
list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
cm = config_item_to_comm(i);
@@ -856,7 +864,6 @@ static struct dlm_comm *get_comm(int nodeid)
config_item_get(i);
break;
}
- mutex_unlock(&clusters_root.subsys.su_mutex);
if (!found)
cm = NULL;
@@ -916,11 +923,20 @@ int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
return rv;
}
-int dlm_comm_seq(int nodeid, uint32_t *seq)
+int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked)
{
- struct dlm_comm *cm = get_comm(nodeid);
+ struct dlm_comm *cm;
+
+ if (locked) {
+ cm = get_comm(nodeid);
+ } else {
+ mutex_lock(&clusters_root.subsys.su_mutex);
+ cm = get_comm(nodeid);
+ mutex_unlock(&clusters_root.subsys.su_mutex);
+ }
if (!cm)
return -EEXIST;
+
*seq = cm->seq;
put_comm(cm);
return 0;
@@ -957,7 +973,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_CLUSTER_NAME ""
struct dlm_config_info dlm_config = {
- .ci_tcp_port = DEFAULT_TCP_PORT,
+ .ci_tcp_port = cpu_to_be16(DEFAULT_TCP_PORT),
.ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE,
.ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
.ci_recover_timer = DEFAULT_RECOVER_TIMER,
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index ed237d910208..e48c4f9686d3 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -29,18 +29,18 @@ extern const struct rhashtable_params dlm_rhash_rsb_params;
#define DLM_PROTO_SCTP 1
struct dlm_config_info {
- int ci_tcp_port;
- int ci_buffer_size;
- int ci_rsbtbl_size;
- int ci_recover_timer;
- int ci_toss_secs;
- int ci_scan_secs;
- int ci_log_debug;
- int ci_log_info;
- int ci_protocol;
- int ci_mark;
- int ci_new_rsb_count;
- int ci_recover_callbacks;
+ __be16 ci_tcp_port;
+ unsigned int ci_buffer_size;
+ unsigned int ci_rsbtbl_size;
+ unsigned int ci_recover_timer;
+ unsigned int ci_toss_secs;
+ unsigned int ci_scan_secs;
+ unsigned int ci_log_debug;
+ unsigned int ci_log_info;
+ unsigned int ci_protocol;
+ unsigned int ci_mark;
+ unsigned int ci_new_rsb_count;
+ unsigned int ci_recover_callbacks;
char ci_cluster_name[DLM_LOCKSPACE_LEN];
};
@@ -50,7 +50,7 @@ int dlm_config_init(void);
void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
-int dlm_comm_seq(int nodeid, uint32_t *seq);
+int dlm_comm_seq(int nodeid, uint32_t *seq, bool locked);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
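The type change encodes a byte-order rule: ci_tcp_port now lives in memory as __be16 (network order), and conversions happen only at the configfs boundary, so lowcomms can pass it straight into sin_port/sin6_port. A condensed sketch of the rule, using the helpers from the hunks above:

	/* Store: parse host-order input, convert exactly once. */
	static int store_port(const char *buf)
	{
		u16 x;
		int rc = kstrtou16(buf, 0, &x);

		if (rc)
			return rc;
		dlm_config.ci_tcp_port = cpu_to_be16(x);	/* __be16 in memory */
		return 0;
	}

	/* Use: already network order, no conversion at the socket boundary. */
	static void set_port(struct sockaddr_in *in4)
	{
		in4->sin_port = dlm_config.ci_tcp_port;
	}

	/* Show: convert back to host order only for display. */
	static int show_port(char *buf)
	{
		return sprintf(buf, "%u\n", be16_to_cpu(dlm_config.ci_tcp_port));
	}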
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 865dc70a9dfc..fc1d710166e9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1703,19 +1703,11 @@ static int msg_reply_type(int mstype)
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- int error = 0;
spin_lock_bh(&ls->ls_waiters_lock);
-
- if (is_overlap_unlock(lkb) ||
- (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
- error = -EINVAL;
- goto out;
- }
-
if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
switch (mstype) {
case DLM_MSG_UNLOCK:
@@ -1725,7 +1717,11 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags);
break;
default:
- error = -EBUSY;
+			/* should never happen: validate_lock_args() checks
+			 * lkb_wait_type, and validate_unlock_args() only
+			 * creates UNLOCK or CANCEL messages.
+ */
+ WARN_ON_ONCE(1);
goto out;
}
lkb->lkb_wait_count++;
@@ -1747,12 +1743,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
out:
- if (error)
- log_error(ls, "addwait error %x %d flags %x %d %d %s",
- lkb->lkb_id, error, dlm_iflags_val(lkb), mstype,
- lkb->lkb_wait_type, lkb->lkb_resource->res_name);
spin_unlock_bh(&ls->ls_waiters_lock);
- return error;
}
/* We clear the RESEND flag because we might be taking an lkb off the waiters
@@ -2861,16 +2852,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
case -EINVAL:
/* annoy the user because dlm usage is wrong */
WARN_ON(1);
- log_error(ls, "%s %d %x %x %x %d %d %s", __func__,
+ log_error(ls, "%s %d %x %x %x %d %d", __func__,
rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags,
- lkb->lkb_status, lkb->lkb_wait_type,
- lkb->lkb_resource->res_name);
+ lkb->lkb_status, lkb->lkb_wait_type);
break;
default:
- log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
+ log_debug(ls, "%s %d %x %x %x %d %d", __func__,
rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags,
- lkb->lkb_status, lkb->lkb_wait_type,
- lkb->lkb_resource->res_name);
+ lkb->lkb_status, lkb->lkb_wait_type);
break;
}
@@ -2928,13 +2917,16 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
goto out;
}
+ if (is_overlap_unlock(lkb))
+ goto out;
+
/* cancel not allowed with another cancel/unlock in progress */
if (args->flags & DLM_LKF_CANCEL) {
if (lkb->lkb_exflags & DLM_LKF_CANCEL)
goto out;
- if (is_overlap(lkb))
+ if (is_overlap_cancel(lkb))
goto out;
if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) {
@@ -2972,9 +2964,6 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
goto out;
- if (is_overlap_unlock(lkb))
- goto out;
-
if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) {
set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags);
rv = -EBUSY;
@@ -3610,10 +3599,7 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
to_nodeid = r->res_nodeid;
- error = add_to_waiters(lkb, mstype, to_nodeid);
- if (error)
- return error;
-
+ add_to_waiters(lkb, mstype, to_nodeid);
error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
if (error)
goto fail;
@@ -3716,10 +3702,7 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
to_nodeid = dlm_dir_nodeid(r);
- error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
- if (error)
- return error;
-
+ add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
if (error)
goto fail;
@@ -5016,16 +4999,19 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_message *ms_local)
{
if (middle_conversion(lkb)) {
+ log_rinfo(ls, "%s %x middle convert in progress", __func__,
+ lkb->lkb_id);
+
+ /* We sent this lock to the new master. The new master will
+ * tell us when it's granted. We no longer need a reply, so
+ * use a fake reply to put the lkb into the right state.
+ */
hold_lkb(lkb);
memset(ms_local, 0, sizeof(struct dlm_message));
ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
_receive_convert_reply(lkb, ms_local, true);
-
- /* Same special case as in receive_rcom_lock_args() */
- lkb->lkb_grmode = DLM_LOCK_IV;
- rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
unhold_lkb(lkb);
} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
@@ -5572,10 +5558,11 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
The real granted mode of these converting locks cannot be determined
until all locks have been rebuilt on the rsb (recover_conversion) */
- if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
- middle_conversion(lkb)) {
- rl->rl_status = DLM_LKSTS_CONVERT;
- lkb->lkb_grmode = DLM_LOCK_IV;
+ if (rl->rl_status == DLM_LKSTS_CONVERT && middle_conversion(lkb)) {
+ /* We may need to adjust grmode depending on other granted locks. */
+ log_limit(ls, "%s %x middle convert gr %d rq %d remote %d %x",
+ __func__, lkb->lkb_id, lkb->lkb_grmode,
+ lkb->lkb_rqmode, lkb->lkb_nodeid, lkb->lkb_remid);
rsb_set_flag(r, RSB_RECOVER_CONVERT);
}
@@ -6344,8 +6331,8 @@ int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
if (error)
return error;
- error = add_to_waiters(lkb, mstype, to_nodeid);
+ add_to_waiters(lkb, mstype, to_nodeid);
dlm_put_lkb(lkb);
- return error;
+ return 0;
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index cb3a10b041c2..df40c3fd1070 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -660,18 +660,18 @@ static void add_sock(struct socket *sock, struct connection *con)
/* Add the port number to an IPv6 or 4 sockaddr and return the address
length */
-static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+static void make_sockaddr(struct sockaddr_storage *saddr, __be16 port,
int *addr_len)
{
saddr->ss_family = dlm_local_addr[0].ss_family;
if (saddr->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
- in4_addr->sin_port = cpu_to_be16(port);
+ in4_addr->sin_port = port;
*addr_len = sizeof(struct sockaddr_in);
memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
} else {
struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
- in6_addr->sin6_port = cpu_to_be16(port);
+ in6_addr->sin6_port = port;
*addr_len = sizeof(struct sockaddr_in6);
}
memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
@@ -1121,7 +1121,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
/*
* sctp_bind_addrs - bind a SCTP socket to all our addresses
*/
-static int sctp_bind_addrs(struct socket *sock, uint16_t port)
+static int sctp_bind_addrs(struct socket *sock, __be16 port)
{
struct sockaddr_storage localaddr;
struct sockaddr *addr = (struct sockaddr *)&localaddr;
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index c9661906568a..b0864c93230f 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -493,7 +493,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
we consider the node to have failed (versus
being removed due to dlm_release_lockspace) */
- error = dlm_comm_seq(memb->nodeid, &seq);
+ error = dlm_comm_seq(memb->nodeid, &seq, false);
if (!error && seq == memb->comm_seq)
return;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 2e1169c81c6e..be4240f09abd 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -811,33 +811,42 @@ static void recover_lvb(struct dlm_rsb *r)
}
/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
- converting PR->CW or CW->PR need to have their lkb_grmode set. */
+ * converting PR->CW or CW->PR may need to have their lkb_grmode changed.
+ */
static void recover_conversion(struct dlm_rsb *r)
{
struct dlm_ls *ls = r->res_ls;
+ uint32_t other_lkid = 0;
+ int other_grmode = -1;
struct dlm_lkb *lkb;
- int grmode = -1;
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
if (lkb->lkb_grmode == DLM_LOCK_PR ||
lkb->lkb_grmode == DLM_LOCK_CW) {
- grmode = lkb->lkb_grmode;
+ other_grmode = lkb->lkb_grmode;
+ other_lkid = lkb->lkb_id;
break;
}
}
+ if (other_grmode == -1)
+ return;
+
list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- if (lkb->lkb_grmode != DLM_LOCK_IV)
- continue;
- if (grmode == -1) {
- log_debug(ls, "recover_conversion %x set gr to rq %d",
- lkb->lkb_id, lkb->lkb_rqmode);
- lkb->lkb_grmode = lkb->lkb_rqmode;
- } else {
- log_debug(ls, "recover_conversion %x set gr %d",
- lkb->lkb_id, grmode);
- lkb->lkb_grmode = grmode;
+ /* Lock recovery created incompatible granted modes, so
+ * change the granted mode of the converting lock to
+ * NL. The rqmode of the converting lock should be CW,
+ * which means the converting lock should be granted at
+ * the end of recovery.
+ */
+ if (((lkb->lkb_grmode == DLM_LOCK_PR) && (other_grmode == DLM_LOCK_CW)) ||
+ ((lkb->lkb_grmode == DLM_LOCK_CW) && (other_grmode == DLM_LOCK_PR))) {
+ log_limit(ls, "%s %x gr %d rq %d, remote %d %x, other_lkid %u, other gr %d, set gr=NL",
+ __func__, lkb->lkb_id, lkb->lkb_grmode,
+ lkb->lkb_rqmode, lkb->lkb_nodeid,
+ lkb->lkb_remid, other_lkid, other_grmode);
+ lkb->lkb_grmode = DLM_LOCK_NL;
}
}
}
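The demotion above fires only for the one genuinely conflicting pair: PR (protected read) and CW (concurrent write) are incompatible with each other, while each is compatible with NL, so dropping the converting lock's granted mode to NL resolves the clash recovery created. The test, isolated:

	/* The PR/CW clash test driving the demotion above. */
	static bool grmodes_clash(int a, int b)
	{
		return (a == DLM_LOCK_PR && b == DLM_LOCK_CW) ||
		       (a == DLM_LOCK_CW && b == DLM_LOCK_PR);
	}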
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 34f4f9f49a6c..12272a8f6d75 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -151,7 +151,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_members(ls, rv, &neg);
if (error) {
log_rinfo(ls, "dlm_recover_members error %d", error);
- goto fail;
+ goto fail_root_list;
}
dlm_recover_dir_nodeid(ls, &root_list);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 827278525fd9..69536cacdea8 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -328,10 +328,10 @@ out:
* Convert an eCryptfs page index into a lower byte offset
*/
static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page)
+ struct folio *folio)
{
return ecryptfs_lower_header_size(crypt_stat) +
- ((loff_t)page->index << PAGE_SHIFT);
+ (loff_t)folio->index * PAGE_SIZE;
}
/**
@@ -340,6 +340,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
* encryption operation
* @dst_page: The page to write the result into
* @src_page: The page to read from
+ * @page_index: The offset in the file (in units of PAGE_SIZE)
* @extent_offset: Page extent offset for use in generating IV
* @op: ENCRYPT or DECRYPT to indicate the desired operation
*
@@ -350,9 +351,9 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
struct page *dst_page,
struct page *src_page,
+ pgoff_t page_index,
unsigned long extent_offset, int op)
{
- pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
struct scatterlist src_sg, dst_sg;
@@ -392,7 +393,7 @@ out:
/**
* ecryptfs_encrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; contains
+ * @folio: Folio mapped from the eCryptfs inode for the file; contains
* decrypted content that needs to be encrypted (to a temporary
* page; not in place) and written out to the lower file
*
@@ -406,7 +407,7 @@ out:
*
* Returns zero on success; negative on error
*/
-int ecryptfs_encrypt_page(struct page *page)
+int ecryptfs_encrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -416,7 +417,7 @@ int ecryptfs_encrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
@@ -431,8 +432,9 @@ int ecryptfs_encrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, enc_extent_page, page,
- extent_offset, ENCRYPT);
+ rc = crypt_extent(crypt_stat, enc_extent_page,
+ folio_page(folio, 0), folio->index,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -440,7 +442,7 @@ int ecryptfs_encrypt_page(struct page *page)
}
}
- lower_offset = lower_offset_for_page(crypt_stat, page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
enc_extent_virt = kmap_local_page(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
PAGE_SIZE);
@@ -461,7 +463,7 @@ out:
/**
* ecryptfs_decrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; data read
+ * @folio: Folio mapped from the eCryptfs inode for the file; data read
* and decrypted from the lower file will be written into this
* page
*
@@ -475,7 +477,7 @@ out:
*
* Returns zero on success; negative on error
*/
-int ecryptfs_decrypt_page(struct page *page)
+int ecryptfs_decrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -484,13 +486,13 @@ int ecryptfs_decrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- lower_offset = lower_offset_for_page(crypt_stat, page);
- page_virt = kmap_local_page(page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
+ page_virt = kmap_local_folio(folio, 0);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
kunmap_local(page_virt);
@@ -504,8 +506,9 @@ int ecryptfs_decrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, page, page,
- extent_offset, DECRYPT);
+ struct page *page = folio_page(folio, 0);
+ rc = crypt_extent(crypt_stat, page, page, folio->index,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error decrypting extent; "
"rc = [%d]\n", __func__, rc);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c586c5db18b5..1f562e75d0e4 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -569,8 +569,8 @@ void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
-int ecryptfs_encrypt_page(struct page *page);
-int ecryptfs_decrypt_page(struct page *page);
+int ecryptfs_encrypt_page(struct folio *folio);
+int ecryptfs_decrypt_page(struct folio *folio);
int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
struct inode *ecryptfs_inode);
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
@@ -653,16 +653,15 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size);
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size);
int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode);
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
size_t *length_size);
int ecryptfs_write_packet_length(char *dest, size_t size,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cbdf82f0183f..a9819ddb1ab8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
return rc;
}
-static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
static int ecryptfs_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
@@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
struct kstat lower_stat;
int rc;
- rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
+ &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index ceda5555971a..60f0ac8744b5 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -23,47 +23,29 @@
#include "ecryptfs_kernel.h"
/*
- * ecryptfs_get_locked_page
- *
- * Get one page from cache or lower f/s, return error otherwise.
- *
- * Returns locked and up-to-date page (if ok), with increased
- * refcnt.
- */
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
-{
- struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
- if (!IS_ERR(page))
- lock_page(page);
- return page;
-}
-
-/**
- * ecryptfs_writepage
- * @page: Page that is locked before this call is made
- * @wbc: Write-back control structure
- *
- * Returns zero on success; non-zero otherwise
- *
* This is where we encrypt the data and pass the encrypted data to
* the lower filesystem. In OpenPGP-compatible mode, we operate on
* entire underlying packets.
*/
-static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+static int ecryptfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- int rc;
-
- rc = ecryptfs_encrypt_page(page);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error encrypting "
- "page (upper index [0x%.16lx])\n", page->index);
- ClearPageUptodate(page);
- goto out;
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ error = ecryptfs_encrypt_page(folio);
+ if (error) {
+ ecryptfs_printk(KERN_WARNING,
+ "Error encrypting folio (index [0x%.16lx])\n",
+ folio->index);
+ folio_clear_uptodate(folio);
+ mapping_set_error(mapping, error);
+ }
+ folio_unlock(folio);
}
- SetPageUptodate(page);
-out:
- unlock_page(page);
- return rc;
+
+ return error;
}
static void strip_xattr_flag(char *page_virt,
@@ -97,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
/**
* ecryptfs_copy_up_encrypted_with_header
- * @page: Sort of a ``virtual'' representation of the encrypted lower
+ * @folio: Sort of a ``virtual'' representation of the encrypted lower
* file. The actual lower file does not have the metadata in
* the header. This is locked.
* @crypt_stat: The eCryptfs inode's cryptographic context
@@ -106,7 +88,7 @@ static void strip_xattr_flag(char *page_virt,
* seeing, with the header information inserted.
*/
static int
-ecryptfs_copy_up_encrypted_with_header(struct page *page,
+ecryptfs_copy_up_encrypted_with_header(struct folio *folio,
struct ecryptfs_crypt_stat *crypt_stat)
{
loff_t extent_num_in_page = 0;
@@ -115,9 +97,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
int rc = 0;
while (extent_num_in_page < num_extents_per_page) {
- loff_t view_extent_num = ((((loff_t)page->index)
+ loff_t view_extent_num = ((loff_t)folio->index
* num_extents_per_page)
- + extent_num_in_page);
+ + extent_num_in_page;
size_t num_header_extents_at_front =
(crypt_stat->metadata_size / crypt_stat->extent_size);
@@ -125,21 +107,21 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_local_page(page);
+ page_virt = kmap_local_folio(folio, 0);
memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
size_t written;
rc = ecryptfs_read_xattr_region(
- page_virt, page->mapping->host);
+ page_virt, folio->mapping->host);
strip_xattr_flag(page_virt + 16, crypt_stat);
ecryptfs_write_header_metadata(page_virt + 20,
crypt_stat,
&written);
}
kunmap_local(page_virt);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
"region; rc = [%d]\n", __func__, rc);
@@ -152,9 +134,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
- crypt_stat->metadata_size);
rc = ecryptfs_read_lower_page_segment(
- page, (lower_offset >> PAGE_SHIFT),
+ folio, (lower_offset >> PAGE_SHIFT),
(lower_offset & ~PAGE_MASK),
- crypt_stat->extent_size, page->mapping->host);
+ crypt_stat->extent_size, folio->mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"extent at offset [%lld] in the lower "
@@ -180,55 +162,50 @@ out:
*/
static int ecryptfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
- int rc = 0;
+ &ecryptfs_inode_to_private(inode)->crypt_stat;
+ int err = 0;
if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
- PAGE_SIZE,
- page->mapping->host);
+ err = ecryptfs_read_lower_page_segment(folio, folio->index, 0,
+ folio_size(folio), inode);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
- rc = ecryptfs_copy_up_encrypted_with_header(page,
- crypt_stat);
- if (rc) {
+ err = ecryptfs_copy_up_encrypted_with_header(folio,
+ crypt_stat);
+ if (err) {
printk(KERN_ERR "%s: Error attempting to copy "
"the encrypted content from the lower "
"file whilst inserting the metadata "
- "from the xattr into the header; rc = "
- "[%d]\n", __func__, rc);
+ "from the xattr into the header; err = "
+ "[%d]\n", __func__, err);
goto out;
}
} else {
- rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_SIZE,
- page->mapping->host);
- if (rc) {
- printk(KERN_ERR "Error reading page; rc = "
- "[%d]\n", rc);
+ err = ecryptfs_read_lower_page_segment(folio,
+ folio->index, 0, folio_size(folio),
+ inode);
+ if (err) {
+ printk(KERN_ERR "Error reading page; err = "
+ "[%d]\n", err);
goto out;
}
}
} else {
- rc = ecryptfs_decrypt_page(page);
- if (rc) {
+ err = ecryptfs_decrypt_page(folio);
+ if (err) {
ecryptfs_printk(KERN_ERR, "Error decrypting page; "
- "rc = [%d]\n", rc);
+ "err = [%d]\n", err);
goto out;
}
}
out:
- if (rc)
- ClearPageUptodate(page);
- else
- SetPageUptodate(page);
- ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
- page->index);
- unlock_page(page);
- return rc;
+ ecryptfs_printk(KERN_DEBUG, "Unlocking folio with index = [0x%.16lx]\n",
+ folio->index);
+ folio_end_read(folio, err == 0);
+ return err;
}
/*
@@ -285,7 +262,7 @@ static int ecryptfs_write_begin(struct file *file,
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE, mapping->host);
+ folio, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
@@ -297,7 +274,7 @@ static int ecryptfs_write_begin(struct file *file,
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
rc = ecryptfs_copy_up_encrypted_with_header(
- &folio->page, crypt_stat);
+ folio, crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting "
"to copy the encrypted content "
@@ -311,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_mark_uptodate(folio);
} else {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE,
+ folio, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
@@ -328,7 +305,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_zero_range(folio, 0, PAGE_SIZE);
folio_mark_uptodate(folio);
} else if (len < PAGE_SIZE) {
- rc = ecryptfs_decrypt_page(&folio->page);
+ rc = ecryptfs_decrypt_page(folio);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
"page at index [%ld]; "
@@ -477,7 +454,7 @@ static int ecryptfs_write_end(struct file *file,
"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- &folio->page, 0, to);
+ folio, 0, to);
if (!rc) {
rc = copied;
fsstack_copy_inode_size(ecryptfs_inode,
@@ -499,7 +476,7 @@ static int ecryptfs_write_end(struct file *file,
"zeros in page with index = [0x%.16lx]\n", index);
goto out;
}
- rc = ecryptfs_encrypt_page(&folio->page);
+ rc = ecryptfs_encrypt_page(folio);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
"index [0x%.16lx])\n", index);
@@ -548,9 +525,10 @@ const struct address_space_operations ecryptfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
#endif
- .writepage = ecryptfs_writepage,
+ .writepages = ecryptfs_writepages,
.read_folio = ecryptfs_read_folio,
.write_begin = ecryptfs_write_begin,
.write_end = ecryptfs_write_end,
+ .migrate_folio = filemap_migrate_folio,
.bmap = ecryptfs_bmap,
};
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3458f153a588..b3b451c2b941 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -41,30 +41,29 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
/**
* ecryptfs_write_lower_page_segment
* @ecryptfs_inode: The eCryptfs inode
- * @page_for_lower: The page containing the data to be written to the
+ * @folio_for_lower: The folio containing the data to be written to the
* lower file
- * @offset_in_page: The offset in the @page_for_lower from which to
+ * @offset_in_page: The offset in the @folio_for_lower from which to
* start writing the data
- * @size: The amount of data from @page_for_lower to write to the
+ * @size: The amount of data from @folio_for_lower to write to the
* lower file
*
* Determines the byte offset in the file for the given page and
* offset within the page, maps the page, and makes the call to write
- * the contents of @page_for_lower to the lower inode.
+ * the contents of @folio_for_lower to the lower inode.
*
* Returns zero on success; non-zero otherwise
*/
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size)
{
char *virt;
loff_t offset;
int rc;
- offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
- + offset_in_page);
- virt = kmap_local_page(page_for_lower);
+ offset = (loff_t)folio_for_lower->index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_lower, 0);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
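
The conversion also rewrites the lower-file offset computation from an explicit shift to a multiplication; the two forms are identical once the index is widened to 64 bits first, which is what the loff_t cast guarantees. A small runnable demonstration (userspace C, hypothetical values):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	int main(void)
	{
		uint64_t index = 0x123456;	/* hypothetical folio->index */
		uint64_t off   = 0x78;		/* hypothetical offset_in_page */

		/* old form: widen, then shift */
		uint64_t by_shift = (index << PAGE_SHIFT) + off;
		/* new form: multiply; the compiler folds this back into a shift */
		uint64_t by_mul   = index * PAGE_SIZE + off;

		assert(by_shift == by_mul);
		printf("lower offset = %#llx\n", (unsigned long long)by_mul);
		return 0;
	}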
@@ -93,7 +92,6 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
size_t size)
{
- struct page *ecryptfs_page;
struct ecryptfs_crypt_stat *crypt_stat;
char *ecryptfs_page_virt;
loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
@@ -111,6 +109,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
else
pos = offset;
while (pos < (offset + size)) {
+ struct folio *ecryptfs_folio;
pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_MASK);
size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
@@ -130,17 +129,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
}
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
+ ecryptfs_folio = read_mapping_folio(ecryptfs_inode->i_mapping,
+ ecryptfs_page_idx, NULL);
+ if (IS_ERR(ecryptfs_folio)) {
+ rc = PTR_ERR(ecryptfs_folio);
printk(KERN_ERR "%s: Error getting page at "
"index [%ld] from eCryptfs inode "
"mapping; rc = [%d]\n", __func__,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_local_page(ecryptfs_page);
+ folio_lock(ecryptfs_folio);
+ ecryptfs_page_virt = kmap_local_folio(ecryptfs_folio, 0);
/*
* pos: where we're now writing, offset: where the request was
@@ -164,17 +164,17 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
data_offset += num_bytes;
}
kunmap_local(ecryptfs_page_virt);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
+ flush_dcache_folio(ecryptfs_folio);
+ folio_mark_uptodate(ecryptfs_folio);
+ folio_unlock(ecryptfs_folio);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
- rc = ecryptfs_encrypt_page(ecryptfs_page);
+ rc = ecryptfs_encrypt_page(ecryptfs_folio);
else
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- ecryptfs_page,
+ ecryptfs_folio,
start_offset_in_page,
data_offset);
- put_page(ecryptfs_page);
+ folio_put(ecryptfs_folio);
if (rc) {
printk(KERN_ERR "%s: Error encrypting "
"page; rc = [%d]\n", __func__, rc);
@@ -228,7 +228,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
/**
* ecryptfs_read_lower_page_segment
- * @page_for_ecryptfs: The page into which data for eCryptfs will be
+ * @folio_for_ecryptfs: The folio into which data for eCryptfs will be
* written
* @page_index: Page index in @page_for_ecryptfs from which to start
* writing
@@ -243,7 +243,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode)
@@ -252,12 +252,12 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
loff_t offset;
int rc;
- offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
- virt = kmap_local_page(page_for_ecryptfs);
+ offset = (loff_t)page_index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_ecryptfs, 0);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
kunmap_local(virt);
- flush_dcache_page(page_for_ecryptfs);
+ flush_dcache_folio(folio_for_ecryptfs);
return rc;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e4421c10caeb..c59086b7eabf 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,6 @@
#include <linux/vfs.h>
#include <linux/blkdev.h>
#include <linux/fs_context.h>
-#include <linux/fs_parser.h>
#include "efs.h"
#include <linux/efs_vh.h>
#include <linux/efs_fs_sb.h>
@@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = {
{0, NULL}
};
-enum {
- Opt_explicit_open,
-};
-
-static const struct fs_parameter_spec efs_param_spec[] = {
- fsparam_flag ("explicit-open", Opt_explicit_open),
- {}
-};
-
/*
* File system definition and registration.
*/
@@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = {
.kill_sb = efs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.init_fs_context = efs_init_fs_context,
- .parameters = efs_param_spec,
};
MODULE_ALIAS_FS("efs");
@@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
- return -EINVAL;
+ return invalf(fc, "device does not support %d byte blocks\n",
+ EFS_BLOCKSIZE);
}
/* read the vh (volume header) block */
@@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
return 0;
}
-static void efs_free_fc(struct fs_context *fc)
-{
- kfree(fc->fs_private);
-}
-
static int efs_get_tree(struct fs_context *fc)
{
return get_tree_bdev(fc, efs_fill_super);
}
-static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- int token;
- struct fs_parse_result result;
-
- token = fs_parse(fc, efs_param_spec, param, &result);
- if (token < 0)
- return token;
- return 0;
-}
-
static int efs_reconfigure(struct fs_context *fc)
{
sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
-struct efs_context {
- unsigned long s_mount_opts;
-};
-
static const struct fs_context_operations efs_context_opts = {
- .parse_param = efs_parse_param,
.get_tree = efs_get_tree,
.reconfigure = efs_reconfigure,
- .free = efs_free_fc,
};
/*
@@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = {
*/
static int efs_init_fs_context(struct fs_context *fc)
{
- struct efs_context *ctx;
-
- ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- fc->fs_private = ctx;
fc->ops = &efs_context_opts;
return 0;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 61debd799cf9..1c49f8962021 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -10,10 +10,10 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- if (buf->kmap_type == EROFS_KMAP)
- kunmap_local(buf->base);
+ if (!buf->base)
+ return;
+ kunmap_local(buf->base);
buf->base = NULL;
- buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@@ -38,20 +38,13 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
}
if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
- folio = read_mapping_folio(buf->mapping, index, NULL);
+ folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
}
buf->page = folio_file_page(folio, index);
-
- if (buf->kmap_type == EROFS_NO_KMAP) {
- if (type == EROFS_KMAP)
- buf->base = kmap_local_page(buf->page);
- buf->kmap_type = type;
- } else if (buf->kmap_type != type) {
- DBG_BUGON(1);
- return ERR_PTR(-EFAULT);
- }
+ if (!buf->base && type == EROFS_KMAP)
+ buf->base = kmap_local_page(buf->page);
if (type == EROFS_NO_KMAP)
return NULL;
return buf->base + (offset & ~PAGE_MASK);
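
With kmap_type gone, erofs_bread() tracks mapping state purely through buf->base: NULL means unmapped, non-NULL means a kmap_local mapping is live. A reduced sketch of that invariant, with illustrative helper names (kernel context assumed; these are not the patch's exact functions):

	static void *metabuf_map(struct erofs_buf *buf)
	{
		if (!buf->base)			/* map lazily, at most once */
			buf->base = kmap_local_page(buf->page);
		return buf->base;
	}

	static void metabuf_unmap(struct erofs_buf *buf)
	{
		if (!buf->base)			/* already unmapped: a no-op */
			return;
		kunmap_local(buf->base);
		buf->base = NULL;		/* re-establish the invariant */
	}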
@@ -61,9 +54,11 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- if (erofs_is_fileio_mode(sbi))
- buf->mapping = file_inode(sbi->fdev)->i_mapping;
- else if (erofs_is_fscache_mode(sb))
+ buf->file = NULL;
+ if (erofs_is_fileio_mode(sbi)) {
+ buf->file = sbi->fdev; /* some filesystems (e.g. FUSE) need it */
+ buf->mapping = buf->file->f_mapping;
+ } else if (erofs_is_fscache_mode(sb))
buf->mapping = sbi->s_fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
@@ -350,7 +345,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
- .kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@@ -411,22 +405,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
- if (iocb->ki_flags & IOCB_DIRECT) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = bdev_logical_block_size(bdev) - 1;
- else
- blksize_mask = i_blocksize(inode) - 1;
-
- if ((iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to)) & blksize_mask)
- return -EINVAL;
-
+ if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
- }
return filemap_read(iocb, to, 0);
}
@@ -473,8 +454,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
#define erofs_file_mmap generic_file_readonly_mmap
#endif
+static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ const struct iomap_ops *ops = &erofs_iomap_ops;
+
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+#ifdef CONFIG_EROFS_FS_ZIP
+ ops = &z_erofs_iomap_report_ops;
+#else
+ return generic_file_llseek(file, offset, whence);
+#endif
+
+ if (whence == SEEK_HOLE)
+ offset = iomap_seek_hole(inode, offset, ops);
+ else if (whence == SEEK_DATA)
+ offset = iomap_seek_data(inode, offset, ops);
+ else
+ return generic_file_llseek(file, offset, whence);
+
+ if (offset < 0)
+ return offset;
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
const struct file_operations erofs_file_fops = {
- .llseek = generic_file_llseek,
+ .llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.get_unmapped_area = thp_get_unmapped_area,
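
With .llseek wired to iomap_seek_hole()/iomap_seek_data() (via z_erofs_iomap_report_ops for compressed inodes), SEEK_HOLE and SEEK_DATA now report the real extent layout instead of treating the whole file as data. A minimal userspace probe:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd = open(argc > 1 ? argv[1] : "file.bin", O_RDONLY);
		off_t data, hole;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		data = lseek(fd, 0, SEEK_DATA);	/* start of the first data extent */
		hole = lseek(fd, 0, SEEK_HOLE);	/* start of the first hole */
		if (data == (off_t)-1 || hole == (off_t)-1)
			perror("lseek");
		else
			printf("data @ %lld, hole @ %lld\n",
			       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}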
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index db29190656eb..d4b89407822a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -318,6 +318,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
+ struct block_device *bdev = inode->i_sb->s_bdev;
bool compressed =
erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
@@ -330,15 +331,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
/*
* Return the DIO alignment restrictions if requested.
*
- * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
- * compressed files, so in these cases we report no DIO support.
+ * In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
+ * and uncompressed inodes, otherwise we report no DIO support.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
stat->result_mask |= STATX_DIOALIGN;
- if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
- stat->dio_mem_align =
- bdev_logical_block_size(inode->i_sb->s_bdev);
- stat->dio_offset_align = stat->dio_mem_align;
+ if (bdev && !compressed) {
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
}
}
generic_fillattr(idmap, request_mask, inode, stat);
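
The upshot is that bdev-backed uncompressed inodes now report the device's DMA alignment for memory buffers and the logical block size for file offsets. From userspace these values surface through statx(2); a small probe, assuming a kernel and libc new enough for STATX_DIOALIGN (the filename is illustrative):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (statx(AT_FDCWD, argc > 1 ? argv[1] : "file.bin",
			  0, STATX_DIOALIGN, &stx)) {
			perror("statx");
			return 1;
		}
		if (stx.stx_mask & STATX_DIOALIGN)
			printf("dio: mem align %u, offset align %u\n",
			       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
		else
			puts("no O_DIRECT support reported");
		return 0;
	}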
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 4efd578d7c62..1c847c30a918 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -20,18 +20,12 @@
#include <linux/iomap.h>
#include "erofs_fs.h"
-/* redefine pr_fmt "erofs: " */
-#undef pr_fmt
-#define pr_fmt(fmt) "erofs: " fmt
-
-__printf(3, 4) void _erofs_err(struct super_block *sb,
- const char *function, const char *fmt, ...);
+__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
- _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
-__printf(3, 4) void _erofs_info(struct super_block *sb,
- const char *function, const char *fmt, ...);
+ _erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
- _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
+ _erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
+
#ifdef CONFIG_EROFS_FS_DEBUG
#define DBG_BUGON BUG_ON
#else
@@ -208,12 +202,6 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-/* basic unit of the workstation of a super_block */
-struct erofs_workgroup {
- pgoff_t index;
- struct lockref lockref;
-};
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -221,9 +209,9 @@ enum erofs_kmap_type {
struct erofs_buf {
struct address_space *mapping;
+ struct file *file;
struct page *page;
void *base;
- enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
@@ -456,20 +444,17 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-void erofs_workgroup_put(struct erofs_workgroup *grp);
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index);
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+
+extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *egrp);
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *z_erofs_get_gbuf(unsigned int requiredpages);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index bed3dbe5b7cb..c235a8e4315e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -18,37 +18,22 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
+void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ int level;
va_start(args, fmt);
- vaf.fmt = fmt;
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
-
if (sb)
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ printk("%c%cerofs (device %s): %pV",
+ KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
- pr_err("%s: %pV", func, &vaf);
- va_end(args);
-}
-
-void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- if (sb)
- pr_info("(device %s): %pV", sb->s_id, &vaf);
- else
- pr_info("%pV", &vaf);
+ printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
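
The merged helper relies on the KERN_SOH encoding: each KERN_* prefix is the SOH byte (0x01) followed by one level character, which printk_get_level() extracts and printk_skip_level() strips. A runnable userspace mimic of the two helpers (simplified; the kernel versions live in include/linux/printk.h):

	#include <stdio.h>

	#define SOH	"\001"		/* KERN_SOH */
	#define LOG_ERR	SOH "3"		/* mirrors KERN_ERR */

	static int get_level(const char *s)
	{
		return (s[0] == SOH[0] && s[1]) ? s[1] : 0;
	}

	static const char *skip_level(const char *s)
	{
		return get_level(s) ? s + 2 : s;
	}

	int main(void)
	{
		const char *msg = LOG_ERR "mount failed\n";

		printf("level '%c', text: %s", get_level(msg), skip_level(msg));
		return 0;
	}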
@@ -631,7 +616,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
- if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
+
+ if (erofs_is_fileio_mode(sbi)) {
+ sb->s_blocksize = 1 << sbi->blkszbits;
+ sb->s_blocksize_bits = sbi->blkszbits;
+ } else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 63cffd0fd261..19d586273b70 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -10,6 +10,7 @@
enum {
attr_feature,
+ attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
};
@@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
static struct attribute *erofs_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
+ ATTR_LIST(drop_caches),
#endif
NULL,
};
@@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
+#ifdef CONFIG_EROFS_FS_ZIP
+ case attr_drop_caches:
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t < 1 || t > 3)
+ return -EINVAL;
+
+ if (t & 2)
+ z_erofs_shrink_scan(sbi, ~0UL);
+ if (t & 1)
+ invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
+ return len;
+#endif
}
return 0;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a569ff9dfd04..01f147505487 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
- struct erofs_workgroup obj;
struct mutex lock;
+ struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
z_erofs_next_pcluster_t next;
+ /* I: start block address of this pcluster */
+ erofs_off_t index;
+
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -108,7 +111,7 @@ struct z_erofs_decompressqueue {
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return !pcl->obj.index;
+ return !pcl->index;
}
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
return fo->mapping == MNGD_MAPPING(sbi);
@@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, pcl->obj.index + i);
+ page = find_get_page(mc, pcl->index + i);
if (!page) {
/* I/O is needed, not possible to decompress directly */
standalone = false;
@@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
}
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
pcl->compressed_bvecs[i].page = page ? page : newpage;
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
continue;
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
if (page)
put_page(page);
@@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
}
/* (erofs_shrinker) disconnect cached encoded data from pclusters */
-int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
+static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
struct folio *folio;
int i;
@@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
return true;
ret = false;
- spin_lock(&pcl->obj.lockref.lock);
- if (pcl->obj.lockref.count <= 0) {
+ spin_lock(&pcl->lockref.lock);
+ if (pcl->lockref.count <= 0) {
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
@@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
}
}
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return ret;
}
@@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
return 0;
}
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -710,13 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
+static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
+{
+ if (lockref_get_not_zero(&pcl->lockref))
+ return true;
+
+ spin_lock(&pcl->lockref.lock);
+ if (__lockref_is_dead(&pcl->lockref)) {
+ spin_unlock(&pcl->lockref.lock);
+ return false;
+ }
+
+ if (!pcl->lockref.count++)
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ spin_unlock(&pcl->lockref.lock);
+ return true;
+}
+
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
bool ztailpacking = map->m_flags & EROFS_MAP_META;
- struct z_erofs_pcluster *pcl;
- struct erofs_workgroup *grp;
+ struct z_erofs_pcluster *pcl, *pre;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@@ -730,8 +747,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- spin_lock_init(&pcl->obj.lockref.lock);
- pcl->obj.lockref.count = 1; /* one ref for this request */
+ spin_lock_init(&pcl->lockref.lock);
+ pcl->lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
@@ -749,19 +766,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
- pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->index = 0; /* which indicates ztailpacking */
} else {
- pcl->obj.index = erofs_blknr(sb, map->m_pa);
-
- grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
- if (IS_ERR(grp)) {
- err = PTR_ERR(grp);
- goto err_out;
+ pcl->index = erofs_blknr(sb, map->m_pa);
+ while (1) {
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ NULL, pcl, GFP_KERNEL);
+ if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
+ xa_unlock(&sbi->managed_pslots);
+ break;
+ }
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
}
-
- if (grp != &pcl->obj) {
- fe->pcl = container_of(grp,
- struct z_erofs_pcluster, obj);
+ if (xa_is_err(pre)) {
+ err = xa_err(pre);
+ goto err_out;
+ } else if (pre) {
+ fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
@@ -781,7 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
- struct erofs_workgroup *grp = NULL;
+ struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
@@ -789,14 +813,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(sb, blknr);
+ while (1) {
+ rcu_read_lock();
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ if (!pcl || z_erofs_get_pcluster(pcl)) {
+ DBG_BUGON(pcl && blknr != pcl->index);
+ rcu_read_unlock();
+ break;
+ }
+ rcu_read_unlock();
+ }
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- if (grp) {
- fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl) {
+ fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@@ -851,12 +884,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
struct z_erofs_pcluster, rcu));
}
-void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
{
- struct z_erofs_pcluster *const pcl =
- container_of(grp, struct z_erofs_pcluster, obj);
+ if (pcl->lockref.count)
+ return false;
- call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ /*
+ * Note that all cached folios should be detached before being deleted
+ * from the XArray; otherwise some folios could still be attached to
+ * the orphan old pcluster when the new one is available in the tree.
+ */
+ if (erofs_try_to_free_all_cached_folios(sbi, pcl))
+ return false;
+
+ /*
+ * It's impossible to fail after the pcluster is frozen, but in order
+ * to avoid some race conditions, add a DBG_BUGON to observe this.
+ */
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+
+ lockref_mark_dead(&pcl->lockref);
+ return true;
+}
+
+static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl)
+{
+ bool free;
+
+ spin_lock(&pcl->lockref.lock);
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ spin_unlock(&pcl->lockref.lock);
+ if (free) {
+ atomic_long_dec(&erofs_global_shrink_cnt);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
+ }
+ return free;
+}
+
+unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
+ unsigned long nr_shrink)
+{
+ struct z_erofs_pcluster *pcl;
+ unsigned int freed = 0;
+ unsigned long index;
+
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, pcl) {
+ /* try to shrink each valid pcluster */
+ if (!erofs_try_to_release_pcluster(sbi, pcl))
+ continue;
+ xa_unlock(&sbi->managed_pslots);
+
+ ++freed;
+ if (!--nr_shrink)
+ return freed;
+ xa_lock(&sbi->managed_pslots);
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return freed;
+}
+
+static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
+ struct z_erofs_pcluster *pcl, bool try_free)
+{
+ bool free = false;
+
+ if (lockref_put_or_lock(&pcl->lockref))
+ return;
+
+ DBG_BUGON(__lockref_is_dead(&pcl->lockref));
+ if (!--pcl->lockref.count) {
+ if (try_free && xa_trylock(&sbi->managed_pslots)) {
+ free = __erofs_try_to_release_pcluster(sbi, pcl);
+ xa_unlock(&sbi->managed_pslots);
+ }
+ atomic_long_add(!free, &erofs_global_shrink_cnt);
+ }
+ spin_unlock(&pcl->lockref.lock);
+ if (free)
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
@@ -877,7 +985,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
* any longer if the pcluster isn't hosted by ourselves.
*/
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
- erofs_workgroup_put(&pcl->obj);
+ z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
@@ -1179,6 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
int i, j, jtop, err2;
struct page *page;
bool overlapped;
+ bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@@ -1236,9 +1345,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
page = be->compressed_pages[i];
- if (!page ||
- erofs_folio_is_managed(sbi, page_folio(page)))
+ if (!page)
+ continue;
+ if (erofs_folio_is_managed(sbi, page_folio(page))) {
+ try_free = false;
continue;
+ }
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1284,6 +1396,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
mutex_unlock(&pcl->lock);
+
+ if (z_erofs_is_inline_pcluster(pcl))
+ z_erofs_free_pcluster(pcl);
+ else
+ z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
@@ -1306,10 +1423,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
err = z_erofs_decompress_pcluster(&be, err) ?: err;
- if (z_erofs_is_inline_pcluster(be.pcl))
- z_erofs_free_pcluster(be.pcl);
- else
- erofs_workgroup_put(&be.pcl->obj);
}
return err;
}
@@ -1391,9 +1504,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
zbv = pcl->compressed_bvecs[nr];
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
if (!zbv.page)
goto out_allocfolio;
@@ -1455,23 +1568,23 @@ repeat:
folio_put(folio);
out_allocfolio:
page = __erofs_allocpage(&f->pagepool, gfp, true);
- spin_lock(&pcl->obj.lockref.lock);
+ spin_lock(&pcl->lockref.lock);
if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
if (page)
erofs_pagepool_add(&f->pagepool, page);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
- spin_unlock(&pcl->obj.lockref.lock);
+ spin_unlock(&pcl->lockref.lock);
bvec->bv_page = page;
if (!page)
return;
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+ filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@@ -1603,7 +1716,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->obj.index),
+ .m_pa = erofs_pos(sb, pcl->index),
};
(void)erofs_map_dev(sb, &mdev);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index a076cca1f547..4535f2f0a014 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -219,7 +219,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
unsigned int amortizedshift;
erofs_off_t pos;
- if (lcn >= totalidx)
+ if (lcn >= totalidx || vi->z_logical_clusterbits > 14)
return -EINVAL;
m->lcn = lcn;
@@ -390,7 +390,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
- do {
+ while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
@@ -402,14 +402,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return err;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- DBG_BUGON(!m->delta[1] &&
- m->clusterofs != 1 << lclusterbits);
+ /* work around invalid d1 generated by pre-1.0 mkfs */
+ if (unlikely(!m->delta[1])) {
+ m->delta[1] = 1;
+ DBG_BUGON(1);
+ }
} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- /* go on until the next HEAD lcluster */
if (lcn != headlcn)
- break;
+ break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
} else {
erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@@ -418,8 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return -EOPNOTSUPP;
}
lcn += m->delta[1];
- } while (m->delta[1]);
-
+ }
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}
diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index 37afe2024840..75704f58ecfa 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2024 Alibaba Cloud
*/
#include "internal.h"
@@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
-static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protected by 'erofs_sb_list_lock' */
-static unsigned int shrinker_run_no;
+atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
-/* protects the mounted 'erofs_sb_list' */
+/* protects `shrinker_run_no` and the mounted `erofs_sb_list` */
static DEFINE_SPINLOCK(erofs_sb_list_lock);
static LIST_HEAD(erofs_sb_list);
+static unsigned int shrinker_run_no;
static struct shrinker *erofs_shrinker_info;
static unsigned int z_erofs_gbuf_id(void)
@@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool)
}
}
-static bool erofs_workgroup_get(struct erofs_workgroup *grp)
-{
- if (lockref_get_not_zero(&grp->lockref))
- return true;
-
- spin_lock(&grp->lockref.lock);
- if (__lockref_is_dead(&grp->lockref)) {
- spin_unlock(&grp->lockref.lock);
- return false;
- }
-
- if (!grp->lockref.count++)
- atomic_long_dec(&erofs_global_shrink_cnt);
- spin_unlock(&grp->lockref.lock);
- return true;
-}
-
-struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- struct erofs_workgroup *grp;
-
-repeat:
- rcu_read_lock();
- grp = xa_load(&sbi->managed_pslots, index);
- if (grp) {
- if (!erofs_workgroup_get(grp)) {
- /* prefer to relax rcu read side */
- rcu_read_unlock();
- goto repeat;
- }
-
- DBG_BUGON(index != grp->index);
- }
- rcu_read_unlock();
- return grp;
-}
-
-struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct erofs_workgroup *pre;
-
- DBG_BUGON(grp->lockref.count < 1);
-repeat:
- xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_KERNEL);
- if (pre) {
- if (xa_is_err(pre)) {
- pre = ERR_PTR(xa_err(pre));
- } else if (!erofs_workgroup_get(pre)) {
- /* try to legitimize the current in-tree one */
- xa_unlock(&sbi->managed_pslots);
- cond_resched();
- goto repeat;
- }
- grp = pre;
- }
- xa_unlock(&sbi->managed_pslots);
- return grp;
-}
-
-static void __erofs_workgroup_free(struct erofs_workgroup *grp)
-{
- atomic_long_dec(&erofs_global_shrink_cnt);
- erofs_workgroup_free_rcu(grp);
-}
-
-void erofs_workgroup_put(struct erofs_workgroup *grp)
-{
- if (lockref_put_or_lock(&grp->lockref))
- return;
-
- DBG_BUGON(__lockref_is_dead(&grp->lockref));
- if (grp->lockref.count == 1)
- atomic_long_inc(&erofs_global_shrink_cnt);
- --grp->lockref.count;
- spin_unlock(&grp->lockref.lock);
-}
-
-static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp)
-{
- int free = false;
-
- spin_lock(&grp->lockref.lock);
- if (grp->lockref.count)
- goto out;
-
- /*
- * Note that all cached pages should be detached before deleted from
- * the XArray. Otherwise some cached pages could be still attached to
- * the orphan old workgroup when the new one is available in the tree.
- */
- if (erofs_try_to_free_all_cached_folios(sbi, grp))
- goto out;
-
- /*
- * It's impossible to fail after the workgroup is freezed,
- * however in order to avoid some race conditions, add a
- * DBG_BUGON to observe this in advance.
- */
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
-
- lockref_mark_dead(&grp->lockref);
- free = true;
-out:
- spin_unlock(&grp->lockref.lock);
- if (free)
- __erofs_workgroup_free(grp);
- return free;
-}
-
-static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink)
-{
- struct erofs_workgroup *grp;
- unsigned int freed = 0;
- unsigned long index;
-
- xa_lock(&sbi->managed_pslots);
- xa_for_each(&sbi->managed_pslots, index, grp) {
- /* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp))
- continue;
- xa_unlock(&sbi->managed_pslots);
-
- ++freed;
- if (!--nr_shrink)
- return freed;
- xa_lock(&sbi->managed_pslots);
- }
- xa_unlock(&sbi->managed_pslots);
- return freed;
-}
-
void erofs_shrinker_register(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -369,8 +230,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
- /* clean up all remaining workgroups in memory */
- erofs_shrink_workstation(sbi, ~0UL);
+ /* clean up all remaining pclusters in memory */
+ z_erofs_shrink_scan(sbi, ~0UL);
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
@@ -418,9 +279,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
-
- freed += erofs_shrink_workstation(sbi, nr - freed);
-
+ freed += z_erofs_shrink_scan(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
p = p->next;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 22c934f3a080..76129bfcd663 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
*/
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
- struct eventfd_ctx *ctx;
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- ctx = eventfd_ctx_fileget(fd_file(f));
- fdput(f);
- return ctx;
+ return eventfd_ctx_fileget(fd_file(f));
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
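
The rewrite uses the kernel's scope-based cleanup guards from <linux/cleanup.h>: CLASS(fd, f)(fd) performs the fdget() and arranges for the matching fdput() to run automatically when f goes out of scope, so every early return is leak-free without goto ladders. The converted shape in isolation (kernel context assumed):

	static struct eventfd_ctx *example_ctx_fdget(int fd)
	{
		CLASS(fd, f)(fd);	/* matching fdput() runs at every exit */

		if (fd_empty(f))	/* no file behind this descriptor */
			return ERR_PTR(-EBADF);
		return eventfd_ctx_fileget(fd_file(f));	/* takes its own ref */
	}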
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..f9898e60dd8b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -420,7 +420,9 @@ static bool busy_loop_ep_timeout(unsigned long start_time,
static bool ep_busy_loop_on(struct eventpoll *ep)
{
- return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
+ return !!READ_ONCE(ep->busy_poll_usecs) ||
+ READ_ONCE(ep->prefer_busy_poll) ||
+ net_busy_loop_on();
}
static bool ep_busy_loop_end(void *p, unsigned long start_time)
@@ -455,6 +457,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
* it back in when we have moved a socket with a valid NAPI
* ID onto the ready list.
*/
+ if (prefer_busy_poll)
+ napi_resume_irqs(napi_id);
ep->napi_id = 0;
return false;
}
@@ -538,6 +542,22 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
}
}
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_suspend_irqs(napi_id);
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ napi_resume_irqs(napi_id);
+}
+
#else
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -555,6 +575,14 @@ static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
return -EOPNOTSUPP;
}
+static void ep_suspend_napi_irqs(struct eventpoll *ep)
+{
+}
+
+static void ep_resume_napi_irqs(struct eventpoll *ep)
+{
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
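
prefer_busy_poll is the per-instance knob set through the EPIOCSPARAMS ioctl; when it is on and a valid NAPI ID has been recorded, the new hooks suspend device IRQs around busy-poll cycles and resume them once polling stops producing events. A userspace sketch of enabling it (structure layout and ioctl number assumed to match linux/eventpoll.h on 6.9+ kernels; the tuning values are illustrative):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/epoll.h>
	#include <sys/ioctl.h>

	#ifndef EPIOCSPARAMS		/* provided by linux/eventpoll.h on 6.9+ */
	struct epoll_params {
		uint32_t busy_poll_usecs;
		uint16_t busy_poll_budget;
		uint8_t prefer_busy_poll;
		uint8_t __pad;
	};
	#define EPIOCSPARAMS	_IOW(0x8A, 0x01, struct epoll_params)
	#endif

	int main(void)
	{
		int epfd = epoll_create1(0);
		struct epoll_params p = {
			.busy_poll_usecs  = 64,
			.busy_poll_budget = 16,
			.prefer_busy_poll = 1,	/* opt in to IRQ suspension */
		};

		if (epfd < 0 || ioctl(epfd, EPIOCSPARAMS, &p))
			perror("EPIOCSPARAMS");
		return 0;
	}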
/*
@@ -786,6 +814,7 @@ static bool ep_refcount_dec_and_test(struct eventpoll *ep)
static void ep_free(struct eventpoll *ep)
{
+ ep_resume_napi_irqs(ep);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
@@ -823,7 +852,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
- file->f_ep = NULL;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
@@ -1002,7 +1032,7 @@ static struct file *epi_fget(const struct epitem *epi)
struct file *file;
file = epi->ffd.file;
- if (!atomic_long_inc_not_zero(&file->f_count))
+ if (!file_ref_get(&file->f_ref))
file = NULL;
return file;
}
@@ -1372,7 +1402,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up(&ep->wq);
+ if (sync)
+ wake_up_sync(&ep->wq);
+ else
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1603,7 +1636,8 @@ allocate:
spin_unlock(&file->f_lock);
goto allocate;
}
- file->f_ep = head;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, head);
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
@@ -2003,8 +2037,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
- if (res)
+ if (res) {
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
return res;
+ }
}
if (timed_out)
@@ -2254,25 +2291,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
{
int error;
int full_check = 0;
- struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
- error = -EBADF;
- f = fdget(epfd);
- if (!fd_file(f))
- goto error_return;
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
/* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!fd_file(tf))
- goto error_fput;
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
/* The target file descriptor must support poll */
- error = -EPERM;
if (!file_can_poll(fd_file(tf)))
- goto error_tgt_fput;
+ return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
@@ -2391,12 +2425,6 @@ error_tgt_fput:
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
return error;
}
@@ -2424,8 +2452,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
- int error;
- struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
@@ -2437,17 +2463,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
- f = fdget(epfd);
- if (!fd_file(f))
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
- error = -EINVAL;
if (!is_file_epoll(fd_file(f)))
- goto error_fput;
+ return -EINVAL;
/*
* At this point it is safe to assume that the "private_data" contains
@@ -2456,11 +2481,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, to);
-
-error_fput:
- fdput(f);
- return error;
+ return ep_poll(ep, events, maxevents, to);
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/fs/exec.c b/fs/exec.c
index 6c53920795c2..aaa605529a75 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -990,7 +990,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
- mm_init_cid(mm);
+ mm_init_cid(mm, tsk);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 591fb3f710be..8042ad873808 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
(ignore_locked ? REQ_RAHEAD : 0),
- ext4_end_bitmap_read);
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
ext4_error_err(sb, EIO, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ef6a3c8f3a9a..02d47a64e8d1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -418,7 +418,7 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[];
+ char name[] __counted_by(name_len);
};
/*
@@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
struct dir_private_info *info;
- int len;
info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + ent_name->len + 1;
- new_fn = kzalloc(len, GFP_KERNEL);
+ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
+ GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
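
struct_size() computes sizeof(struct fname) plus space for the flexible name[] member without the open-coded length arithmetic, and __counted_by(name_len) lets FORTIFY/UBSAN bounds-check accesses against the stored length. A runnable userspace stand-in (the macro here is a simplified version of the kernel's overflow-checked one):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* simplified stand-in for the kernel's overflow-checked struct_size() */
	#define struct_size(p, member, n) \
		(sizeof(*(p)) + sizeof((p)->member[0]) * (size_t)(n))

	struct fname {
		unsigned char name_len;
		char name[];	/* __counted_by(name_len) in the kernel */
	};

	int main(void)
	{
		const char *s = "dirent";
		struct fname *fn = calloc(1, struct_size(fn, name, strlen(s) + 1));

		if (!fn)
			return 1;
		fn->name_len = strlen(s);
		memcpy(fn->name, s, fn->name_len + 1);	/* copy including NUL */
		printf("%s (%u bytes)\n", fn->name, (unsigned)fn->name_len);
		free(fn);
		return 0;
	}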
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 44b0d418143c..74f2071189b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1729,6 +1729,10 @@ struct ext4_sb_info {
*/
struct work_struct s_sb_upd_work;
+ /* Atomic write unit values in bytes */
+ unsigned int s_awu_min;
+ unsigned int s_awu_max;
+
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@@ -1865,14 +1869,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb,
return false;
}
-static inline void ext4_simulate_fail_bh(struct super_block *sb,
- struct buffer_head *bh,
- unsigned long code)
-{
- if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
- clear_buffer_uptodate(bh);
-}
-
/*
* Error number codes for s_{first,last}_error_errno
*
@@ -3100,9 +3096,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
@@ -3855,6 +3851,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
+static inline bool ext4_inode_can_atomic_write(struct inode *inode)
+{
+ return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+}
+
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 34e25eee6521..a07a98a4b97a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = ext4_read_bh(bh, 0, NULL);
+ err = ext4_read_bh(bh, 0, NULL, false);
if (err < 0)
goto errout;
}
@@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
return;
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN, 0);
+ EXTENT_STATUS_WRITTEN, false);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -4158,7 +4158,7 @@ insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
ext4_es_insert_extent(inode, hole_start, len, ~0,
- EXTENT_STATUS_HOLE, 0);
+ EXTENT_STATUS_HOLE, false);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos;
+ loff_t epos, old_size = i_size_read(inode);
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ retry:
if (ext4_update_inode_size(inode, epos) & 0x1)
inode_set_mtime_to_ts(inode,
inode_get_ctime(inode));
+ if (epos > old_size) {
+ pagecache_isize_extended(inode, old_size, epos);
+ ext4_zero_partial_blocks(handle, inode,
+ old_size, epos - old_size);
+ }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c786691dabd3..ae29832aab1e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -848,7 +848,7 @@ out:
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags)
+ unsigned int status, bool delalloc_reserve_used)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
@@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, flags, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
if (!len)
return;
@@ -945,7 +945,7 @@ error:
resv_used += pending;
if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
- flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ delalloc_reserve_used);
if (err1 || err2 || err3 < 0)
goto retry;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4424232de298..8f9c008d11e8 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags);
+ unsigned int status,
+ bool delalloc_reserve_used);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b33664f6ce2a..26c4fc37edcf 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode)
return;
restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_unlock(&sbi->s_fc_lock);
return;
}
@@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
}
spin_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- if (has_transaction &&
- (!is_ineligible ||
- (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
+ if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
spin_unlock(&sbi->s_fc_lock);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..3bd96c3d4cd0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
*/
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode))
- return size;
- return ext4_handle_inode_extension(inode, pos, size, size);
+ return 0;
+ error = ext4_handle_inode_extension(inode, pos, size, size);
+ return error < 0 ? error : 0;
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -564,12 +565,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
-
ext4_journal_stop(handle);
+ if (ret)
+ goto out;
}
if (ilock_shared && !unwritten)
@@ -599,6 +597,13 @@ out:
ssize_t err;
loff_t endbyte;
+ /*
+ * There is no support for atomic writes on buffered io yet;
+ * we should never fall back to buffered io for DIO atomic
+ * writes.
+ */
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
+
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@@ -692,6 +697,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ size_t len = iov_iter_count(from);
+ int ret;
+
+ if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
+ len > EXT4_SB(inode->i_sb)->s_awu_max)
+ return -EINVAL;
+
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
@@ -884,6 +903,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
+ if (ext4_inode_can_atomic_write(inode))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
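
Once FMODE_CAN_ATOMIC_WRITE is set, userspace can request untorn writes with pwritev2(..., RWF_ATOMIC) on an O_DIRECT descriptor, provided the total length sits within the [s_awu_min, s_awu_max] window that ext4 advertises via statx. A hedged sketch (flag value per uapi linux/fs.h; the filename and sizes are illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <unistd.h>

	#ifndef RWF_ATOMIC
	#define RWF_ATOMIC	0x00000040	/* per uapi linux/fs.h */
	#endif

	int main(void)
	{
		static char buf[4096] __attribute__((aligned(4096)));
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		int fd = open("testfile", O_RDWR | O_DIRECT);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(buf, 'A', sizeof(buf));
		/* all-or-nothing: len must lie within [awu_min, awu_max] */
		if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
			perror("pwritev2(RWF_ATOMIC)");
		close(fd);
		return 0;
	}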
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index df853c4d3a8c..383c6edea6dd 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
return fmr->fmr_physical + fmr->fmr_length;
}
+static int ext4_getfsmap_meta_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb, fs_start, fs_end;
+ int error;
+
+ fs_start = fsb = (EXT4_C2B(sbi, start) +
+ ext4_group_first_block_no(sb, agno));
+ fs_end = fs_start + EXT4_C2B(sbi, len);
+
+ /* Return relevant extents from the meta_list */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical < info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ if (p->fmr_physical <= fs_start ||
+ p->fmr_physical + p->fmr_length <= fs_end) {
+ /* Emit the retained free extent record if present */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info,
+ &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0;
+ }
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+ fsb = p->fmr_physical + p->fmr_length;
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ }
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+
+ return 0;
+}
+
/* Transform a blockgroup's free record into a fsmap */
static int ext4_getfsmap_datadev_helper(struct super_block *sb,
ext4_group_t agno, ext4_grpblk_t start,
@@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
error = ext4_mballoc_query_range(sb, info->gfi_agno,
EXT4_B2C(sbi, info->gfi_low.fmr_physical),
EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_meta_helper,
ext4_getfsmap_datadev_helper, info);
if (error)
goto err;
@@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
/* Report any gaps at the end of the bg */
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
+ 0, info);
if (error)
goto err;
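
With ext4_getfsmap_meta_helper() plugged into the mballoc range query, GETFSMAP now emits records for the group metadata that sits between free extents rather than silently skipping it. A sketch of consuming the ioctl from userspace, assuming the record layout from uapi linux/fsmap.h; the 128-entry buffer is an arbitrary choice:

/* Hedged sketch: dump one batch of fsmap records. */
#include <linux/fsmap.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int dump_fsmap(int fd)
{
	struct fsmap_head *head;
	unsigned int i;

	head = calloc(1, sizeof(*head) + 128 * sizeof(struct fsmap));
	if (!head)
		return -1;
	head->fmh_count = 128;
	/* fmh_keys[0] stays zero (start of device); all-ones in
	 * fmh_keys[1] means "up to the end". */
	memset(&head->fmh_keys[1], 0xff, sizeof(struct fsmap));

	if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0) {
		free(head);
		return -1;
	}
	for (i = 0; i < head->fmh_entries; i++)
		printf("phys %llu len %llu owner %llu\n",
		       (unsigned long long)head->fmh_recs[i].fmr_physical,
		       (unsigned long long)head->fmh_recs[i].fmr_length,
		       (unsigned long long)head->fmh_recs[i].fmr_owner);
	free(head);
	return 0;
}
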
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7f1a5f90dbbd..21d228073d79 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO,
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO));
if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7404f0935c90..7de327fa7b1c 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
}
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
put_bh(bh);
goto failure;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 54bdd4884fe6..89aade6f45f6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, 0);
+ map->m_pblk, status, false);
return retval;
}
@@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, flags);
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
+ status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
return retval;
}
@@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (nowait)
return sb_find_get_block(inode->i_sb, map.m_pblk);
- bh = sb_getblk(inode->i_sb, map.m_pblk);
+ /*
+	 * The bh can pick up extra references, e.g. from an attached
+	 * journal_head. Avoid __GFP_MOVABLE here, since page migration
+	 * can fail while such a reference remains.
+ */
+ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
+ inode->i_sb->s_blocksize);
+
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
@@ -1307,8 +1314,10 @@ static int ext4_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1423,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2985,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
folio_unlock(folio);
@@ -3029,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos)
+ if (pos > old_size) {
pagecache_isize_extended(inode, old_size, pos);
+ zero_len = pos - old_size;
+ }
- if (disksize_changed) {
- handle_t *handle;
+ if (!disksize_changed && !zero_len)
+ return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
- }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ if (zero_len)
+ ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
return copied;
}
@@ -3444,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
+static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
+{
+ /* must be a directio to fall back to buffered */
+ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
+ (IOMAP_WRITE | IOMAP_DIRECT))
+ return false;
+
+ /* atomic writes are all-or-nothing */
+ if (flags & IOMAP_ATOMIC)
+ return false;
+
+ /* can only try again if we wrote nothing */
+ return written == 0;
+}
+
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
/*
* Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code so that we
- * fallback to buffered I/O and attempt to complete the remainder of
- * the I/O. Any blocks that may have been allocated in preparation for
- * the direct I/O will be reused during buffered I/O.
+	 * the allocated blocks. If so, return the magic error code for a
+	 * non-atomic write so that we fall back to buffered I/O and attempt
+	 * to complete the remainder of the I/O.
+	 * For non-atomic writes, any blocks that may have been allocated in
+	 * preparation for the direct I/O will be reused during buffered I/O.
+	 * For atomic writes, we never fall back to buffered I/O.
*/
- if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ if (ext4_want_directio_fallback(flags, written))
return -ENOTBLK;
return 0;
@@ -4497,10 +4529,10 @@ make_io:
* Read the block from disk.
*/
trace_ext4_load_inode(sb, ino);
- ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
blk_finish_plug(&plug);
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
if (!buffer_uptodate(bh)) {
if (ret_block)
*ret_block = block;
@@ -5426,6 +5458,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_size != inode->i_size) {
+ /* attach jbd2 jinode for EOF folio tail zeroing */
+ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+ oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_inode_attach_jinode(inode);
+ if (error)
+ goto err_out;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -5436,12 +5476,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
orphan = 1;
}
/*
- * Update c/mtime on truncate up, ext4_truncate() will
- * update c/mtime in shrink case below
+ * Update c/mtime and tail zero the EOF folio on
+ * truncate up. ext4_truncate() handles the shrink case
+ * below.
*/
- if (!shrink)
+ if (!shrink) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
+ if (oldsize & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle,
+ inode->i_mapping, oldsize);
+ }
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -5578,6 +5623,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
+ if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int awu_min = 0, awu_max = 0;
+
+ if (ext4_inode_can_atomic_write(inode)) {
+ awu_min = sbi->s_awu_min;
+ awu_max = sbi->s_awu_max;
+ }
+
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
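
The STATX_WRITE_ATOMIC branch above is how the awu_min/awu_max limits established at mount time become visible to applications. A short sketch of querying them, assuming kernel and libc headers recent enough to carry STATX_WRITE_ATOMIC and the stx_atomic_write_* fields:

/* Hedged sketch: read a file's atomic write limits via statx(2). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

static int show_awu(const char *path)
{
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_WRITE_ATOMIC, &stx) < 0)
		return -1;
	if (!(stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)) {
		printf("%s: atomic writes not supported\n", path);
		return 0;
	}
	printf("awu_min=%u awu_max=%u\n",
	       stx.stx_atomic_write_unit_min,
	       stx.stx_atomic_write_unit_max);
	return 0;
}
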
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1c77400bd88e..7b9ce71c1c81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1330,7 +1330,6 @@ group_extend_out:
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -1342,30 +1341,26 @@ group_extend_out:
return -EFAULT;
me.moved_len = 0;
- donor = fdget(me.donor_fd);
- if (!fd_file(donor))
+ CLASS(fd, donor)(me.donor_fd);
+ if (fd_empty(donor))
return -EBADF;
- if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto mext_out;
- }
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE))
+ return -EBADF;
if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
} else if (IS_DAX(inode)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with DAX");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
}
err = mnt_want_write_file(filp);
if (err)
- goto mext_out;
+ return err;
err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
@@ -1374,8 +1369,6 @@ group_extend_out:
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
-mext_out:
- fdput(donor);
return err;
}
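
This hunk, like the f2fs and fcntl conversions further down, swaps a manual fdget()/fdput() pair for a scope-based guard, which is why the mext_out label disappears: the reference is dropped on every return path automatically. A simplified model of the machinery, based on DEFINE_CLASS()/CLASS() from include/linux/cleanup.h (the real fd class lives in include/linux/file.h):

/* Hedged sketch: a constructor/destructor pair bound to scope exit
 * via __attribute__((cleanup)). */
DEFINE_CLASS(fd, struct fd,	/* guard name and type */
	     fdput(_T),		/* exit action, runs on all return paths */
	     fdget(fd),		/* init expression */
	     int fd)

static int example(int donor_fd)
{
	CLASS(fd, donor)(donor_fd);	/* ~ struct fd donor = fdget(donor_fd)
					 * with fdput(donor) queued at scope exit */
	if (fd_empty(donor))
		return -EBADF;		/* fdput() still runs here */
	/* ... use fd_file(donor) ... */
	return 0;
}					/* ... and here */
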
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d73e38323879..b25a27c86696 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
- mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
+ mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa));
if (ac->ac_pa)
mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
"group pa" : "inode pa");
@@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
}
out_dbg:
- mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
return ret;
}
@@ -6999,13 +6999,14 @@ int
ext4_mballoc_query_range(
struct super_block *sb,
ext4_group_t group,
- ext4_grpblk_t start,
+ ext4_grpblk_t first,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv)
{
void *bitmap;
- ext4_grpblk_t next;
+ ext4_grpblk_t start, next;
struct ext4_buddy e4b;
int error;
@@ -7016,10 +7017,19 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = max(e4b.bd_info->bb_first_free, start);
+ start = max(e4b.bd_info->bb_first_free, first);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
-
+ if (meta_formatter && start != first) {
+ if (start > end)
+ start = end;
+ ext4_unlock_group(sb, group);
+ error = meta_formatter(sb, group, first, start - first,
+ priv);
+ if (error)
+ goto out_unload;
+ ext4_lock_group(sb, group);
+ }
while (start <= end) {
start = mb_find_next_zero_bit(bitmap, end + 1, start);
if (start > end)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d8553f1498d3..f8280de3e882 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -259,6 +259,7 @@ ext4_mballoc_query_range(
ext4_group_t agno,
ext4_grpblk_t start,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index bd946d0c71b7..d64c04ed061a 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
lock_buffer(*bh);
- ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
if (ret)
goto warn_exit;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b64661ea6e0e..898443e98efc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
unlock_buffer(bh);
continue;
}
- ext4_read_bh_nowait(bh, 0, NULL);
+ ext4_read_bh_nowait(bh, 0, NULL, false);
nr++;
} while (block++, (bh = bh->b_this_page) != head);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 790db7eac6c2..bcf2737078b8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
#endif
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
- return (struct buffer_head *) frame;
+ return ERR_CAST(frame);
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
@@ -1952,7 +1952,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- return (struct ext4_dir_entry_2 *) bh2;
+ return ERR_CAST(bh2);
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
else
split = count/2;
+ if (WARN_ON_ONCE(split == 0)) {
+ /* Should never happen, but avoid out-of-bounds access below */
+ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0,
+ "bad indexed directory? hash=%08x:%08x count=%d move=%u",
+ hinfo->hash, hinfo->minor_hash, count, move);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
hash2 = map[split].hash;
- continued = split > 0 ? hash2 == map[split - 1].hash : 0;
+ continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
(unsigned long)dx_get_block(frame->at),
hash2, split, count-split));
@@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
return de;
journal_error:
+ ext4_std_error(dir->i_sb, err);
+out:
brelse(*bh);
brelse(bh2);
*bh = NULL;
- ext4_std_error(dir->i_sb, err);
return ERR_PTR(err);
}
@@ -2395,11 +2405,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
-#if IS_ENABLED(CONFIG_UNICODE)
- if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- utf8_validate(sb->s_encoding, &dentry->d_name))
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
-#endif
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad5543866d21..69b8a7221a2b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -417,11 +417,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL)
+ if (io->io_bio == NULL) {
io_submit_init_bio(io, bh);
+ io->io_bio->bi_write_hint = inode->i_write_hint;
+ }
if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a2704f064361..72f77f78ae8d 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
brelse(bh);
return NULL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 16a4ce704460..785809f33ff4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -161,8 +161,14 @@ MODULE_ALIAS("ext3");
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
+ if (simu_fail) {
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ return;
+ }
+
/*
* buffer's verified bit is no longer valid after reading from
* disk again due to write out error, clear it to make sure we
@@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
}
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
unlock_buffer(bh);
return;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
}
-int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
return 0;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
lock_buffer(bh);
if (!wait) {
- ext4_read_bh_nowait(bh, op_flags, NULL);
+ ext4_read_bh_nowait(bh, op_flags, NULL, false);
return 0;
}
- return ext4_read_bh(bh, op_flags, NULL);
+ return ext4_read_bh(bh, op_flags, NULL, false);
}
/*
@@ -266,7 +273,7 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
if (likely(bh)) {
if (trylock_buffer(bh))
- ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false);
brelse(bh);
}
}
@@ -346,9 +353,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb,
__u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg)
{
- return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
}
__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -402,9 +409,9 @@ void ext4_free_group_clusters_set(struct super_block *sb,
void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count)
{
- bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
- bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+ WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16));
}
void ext4_used_dirs_set(struct super_block *sb,
@@ -2096,16 +2103,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
}
#define EXT4_SET_CTX(name) \
-static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name |= flag; \
}
#define EXT4_CLEAR_CTX(name) \
-static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name &= ~flag; \
@@ -3030,6 +3037,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PUTS("mb_optimize_scan=1");
}
+ if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
+ SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3709,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- start_time = ktime_get_real_ns();
+ start_time = ktime_get_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) *
EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -3774,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg)
cont_thread:
while (true) {
- next_wakeup = MAX_JIFFY_OFFSET;
+ bool next_wakeup_initialized = false;
+ next_wakeup = 0;
mutex_lock(&eli->li_list_mtx);
if (list_empty(&eli->li_request_list)) {
mutex_unlock(&eli->li_list_mtx);
@@ -3788,8 +3799,11 @@ cont_thread:
lr_request);
if (time_before(jiffies, elr->lr_next_sched)) {
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
continue;
}
if (down_read_trylock(&elr->lr_super->s_umount)) {
@@ -3817,16 +3831,18 @@ cont_thread:
elr->lr_next_sched = jiffies +
get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
}
mutex_unlock(&eli->li_list_mtx);
try_to_freeze();
cur = jiffies;
- if ((time_after_eq(cur, next_wakeup)) ||
- (MAX_JIFFY_OFFSET == next_wakeup)) {
+ if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) {
cond_resched();
continue;
}
@@ -4425,6 +4441,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
return 0;
}
+/*
+ * ext4_atomic_write_init - initialize filesystem min & max atomic write units
+ * @sb: super block
+ * TODO: Later add support for bigalloc
+ */
+static void ext4_atomic_write_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(sb->s_blocksize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
+ }
+}
+
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -5336,6 +5382,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
+ ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -6301,7 +6348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
- return 0;
+ return -EIO;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6518,8 +6565,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
}
- if (test_opt2(sb, ABORT))
- ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) &&
+ !test_opt(sb, DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount");
+ err = -EINVAL;
+ goto restore_opts;
+ }
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -6689,6 +6740,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
+ /*
+	 * Handle aborting the filesystem as the last thing during remount to
+	 * avoid obscure errors if some option changes fail to apply because
+	 * the filesystem has already been shut down.
+ */
+ if (test_opt2(sb, ABORT))
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+
return 0;
restore_opts:
@@ -7329,7 +7388,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
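
Elsewhere in this super.c diff, the bg_free_inodes_count accessors gain READ_ONCE()/WRITE_ONCE(). That pairing is the usual idiom when a lockless reader races a locked writer: it forbids the compiler from tearing, fusing, or re-reading the 16-bit halves. A toy model of the pattern (illustrative, not ext4 code):

/* Hedged sketch: torn-access-safe split counter. */
#include <linux/compiler.h>

static unsigned short count_lo, count_hi;

static void set_count(unsigned int count)
{
	WRITE_ONCE(count_lo, (unsigned short)count);
	WRITE_ONCE(count_hi, (unsigned short)(count >> 16));
}

static unsigned int get_count(void)
{
	/* Each half is loaded exactly once. The combined value can still
	 * be skewed across halves; callers of these accessors tolerate
	 * that, they just must not see a torn single half. */
	return READ_ONCE(count_lo) |
	       ((unsigned int)READ_ONCE(count_hi) << 16);
}
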
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b084f601..e3ce763cce18 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
@@ -911,7 +912,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
@@ -1011,7 +1013,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 321d8ffbab6e..84447d5145aa 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3038,32 +3038,27 @@ out:
static int __f2fs_ioc_move_range(struct file *filp,
struct f2fs_move_range *range)
{
- struct fd dst;
int err;
if (!(filp->f_mode & FMODE_READ) ||
!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- dst = fdget(range->dst_fd);
- if (!fd_file(dst))
+ CLASS(fd, dst)(range->dst_fd);
+ if (fd_empty(dst))
return -EBADF;
- if (!(fd_file(dst)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto err_out;
- }
+ if (!(fd_file(dst)->f_mode & FMODE_WRITE))
+ return -EBADF;
err = mnt_want_write_file(filp);
if (err)
- goto err_out;
+ return err;
err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst),
range->pos_out, range->len);
mnt_drop_write_file(filp);
-err_out:
- fdput(dst);
return err;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22dd9dcce7ec..ac77dd912412 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
@@ -397,6 +396,9 @@ static long f_dupfd_query(int fd, struct file *filp)
{
CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
/*
* We can do the 'fdput()' immediately, as the only thing that
* matters is the pointer value which isn't changed by the fdput.
@@ -570,24 +572,21 @@ static int check_fcntl_cmd(unsigned cmd)
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget_raw(fd);
- long err = -EBADF;
+ CLASS(fd_raw, f)(fd);
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
err = do_fcntl(fd, cmd, arg, fd_file(f));
-out1:
- fdput(f);
-out:
return err;
}
@@ -596,21 +595,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
void __user *argp = (void __user *)arg;
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock64 flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out1;
+ return err;
switch (cmd) {
case F_GETLK64:
@@ -635,9 +634,6 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out1:
- fdput(f);
-out:
return err;
}
#endif
@@ -733,21 +729,21 @@ static int fixup_compat_flock(struct flock *flock)
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
compat_ulong_t arg)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- return err;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out_put;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out_put;
+ return err;
switch (cmd) {
case F_GETLK:
@@ -790,8 +786,6 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out_put:
- fdput(f);
return err;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 82df28d45cd7..5f801139358e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -139,12 +139,11 @@ static int get_path_from_fd(int fd, struct path *root)
path_get(root);
spin_unlock(&fs->lock);
} else {
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
- fdput(f);
}
return 0;
diff --git a/fs/file.c b/fs/file.c
index eb093e736972..fb1011cf6b4a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,73 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
+#include <linux/file_ref.h>
#include <net/sock.h>
#include "internal.h"
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref: Pointer to the reference count
+ * @cnt: Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely schedule the
+ * object, which is protected by the reference counter, for
+ * deconstruction.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+ /* Did this drop the last reference? */
+ if (likely(cnt == FILE_REF_NOREF)) {
+ /*
+ * Carefully try to set the reference count to FILE_REF_DEAD.
+ *
+ * This can fail if a concurrent get() operation has
+ * elevated it again or the corresponding put() even marked
+ * it dead already. Both are valid situations and do not
+ * require a retry. If this fails the caller is not
+ * allowed to deconstruct the object.
+ */
+ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+ return false;
+
+ /*
+ * The caller can safely schedule the object for
+ * deconstruction. Provide acquire ordering.
+ */
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
@@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
- *
- * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
- * by that "1024/sizeof(ptr)" before, we already know there are sufficient
- * clear low bits. Clang seems to realize that, gcc ends up being confused.
- *
- * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
- * let's consider it documentation (and maybe a test-case for gcc to improve
- * its code generation ;)
*/
-static struct fdtable * alloc_fdtable(unsigned int nr)
+static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
struct fdtable *fdt;
+ unsigned int nr;
void *data;
/*
@@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
+	 * and growing in powers of two from there on. Since we are called
+	 * only with slots_wanted > BITS_PER_LONG (the embedded instance in
+	 * files->fdtab already gives BITS_PER_LONG slots), the above boils
+	 * down to
+ * 1. use the smallest power of two large enough to give us that many
+ * slots.
+ * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
+ * 256 slots (i.e. 1Kb fd array).
+ * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
+ * and we are never going to be asked for 64 or less.
*/
- nr /= (1024 / sizeof(struct file *));
- nr = roundup_pow_of_two(nr + 1);
- nr *= (1024 / sizeof(struct file *));
- nr = ALIGN(nr, BITS_PER_LONG);
+ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
+ nr = 256;
+ else
+ nr = roundup_pow_of_two(slots_wanted);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
- * had been set lower between the check in expand_files() and here. Deal
- * with that in caller, it's cheaper that way.
+ * had been set lower between the check in expand_files() and here.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
- if (unlikely(nr > sysctl_nr_open))
- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+ if (unlikely(nr > sysctl_nr_open)) {
+ nr = round_down(sysctl_nr_open, BITS_PER_LONG);
+ if (nr < slots_wanted)
+ return ERR_PTR(-EMFILE);
+ }
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
@@ -152,14 +218,14 @@ out_arr:
out_fdt:
kfree(fdt);
out:
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/*
* Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of
* the given size.
- * Return <0 error code on error; 1 on successful completion.
+ * Return <0 error code on error; 0 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_fdtable(struct files_struct *files, unsigned int nr)
@@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock);
- new_fdt = alloc_fdtable(nr);
+ new_fdt = alloc_fdtable(nr + 1);
/* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
@@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
synchronize_rcu();
spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * extremely unlikely race - sysctl_nr_open decreased between the check in
- * caller and alloc_fdtable(). Cheaper to catch it here...
- */
- if (unlikely(new_fdt->max_fds <= nr)) {
- __free_fdtable(new_fdt);
- return -EMFILE;
- }
+ if (IS_ERR(new_fdt))
+ return PTR_ERR(new_fdt);
cur_fdt = files_fdtable(files);
BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt);
@@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */
smp_wmb();
- return 1;
+ return 0;
}
/*
* Expand files.
* This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
+ * Return <0 error code on error; 0 on success.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
@@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr)
__acquires(files->file_lock)
{
struct fdtable *fdt;
- int expanded = 0;
+ int error;
repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
- return expanded;
+ return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
@@ -227,7 +284,6 @@ repeat:
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
- expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
@@ -235,27 +291,28 @@ repeat:
/* All good, so we try */
files->resize_in_progress = true;
- expanded = expand_fdtable(files, nr);
+ error = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
- return expanded;
-}
-
-static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
-{
- __set_bit(fd, fdt->close_on_exec);
+ return error;
}
-static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
+ bool set)
{
- if (test_bit(fd, fdt->close_on_exec))
- __clear_bit(fd, fdt->close_on_exec);
+ if (set) {
+ __set_bit(fd, fdt->close_on_exec);
+ } else {
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
+ }
}
-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
__set_bit(fd, fdt->open_fds);
+ __set_close_on_exec(fd, fdt, set);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
@@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
- __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
+ fd /= BITS_PER_LONG;
+ if (test_bit(fd, fdt->full_fds_bits))
+ __clear_bit(fd, fdt->full_fds_bits);
}
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
@@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
struct file **old_fds, **new_fds;
unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
- int error;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
@@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
- new_fdt = alloc_fdtable(open_files - 1);
- if (!new_fdt) {
- error = -ENOMEM;
- goto out_release;
- }
-
- /* beyond sysctl_nr_open; nothing to do */
- if (unlikely(new_fdt->max_fds < open_files)) {
- __free_fdtable(new_fdt);
- error = -EMFILE;
- goto out_release;
+ new_fdt = alloc_fdtable(open_files);
+ if (IS_ERR(new_fdt)) {
+ kmem_cache_free(files_cachep, newf);
+ return ERR_CAST(new_fdt);
}
/*
@@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
rcu_assign_pointer(newf->fdt, new_fdt);
return newf;
-
-out_release:
- kmem_cache_free(files_cachep, newf);
- return ERR_PTR(error);
}
static struct fdtable *close_files(struct files_struct * files)
@@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file *file = fdt->fd[i];
if (file) {
filp_close(file, files);
cond_resched();
@@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
+ unsigned int bit;
+
+ /*
+ * Try to avoid looking at the second level bitmap
+ */
+ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
+ start & (BITS_PER_LONG - 1));
+ if (bit < BITS_PER_LONG)
+ return bit + bitbit * BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit >= maxfd)
@@ -496,7 +552,7 @@ repeat:
if (fd < files->next_fd)
fd = files->next_fd;
- if (fd < fdt->max_fds)
+ if (likely(fd < fdt->max_fds))
fd = find_next_fd(fdt, fd);
/*
@@ -504,36 +560,22 @@ repeat:
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
- if (fd >= end)
+ if (unlikely(fd >= end))
goto out;
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
+ if (unlikely(fd >= fdt->max_fds)) {
+ error = expand_files(files, fd);
+ if (error < 0)
+ goto out;
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
goto repeat;
+ }
if (start <= files->next_fd)
files->next_fd = fd + 1;
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
-#if 1
- /* Sanity check */
- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- }
-#endif
out:
spin_unlock(&files->file_lock);
@@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
+ WARN_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
}
/**
- * __close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
@@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
+ * Currently, errors from closing a given file descriptor are ignored.
*/
-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+ unsigned int, flags)
{
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
@@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
if (!file)
return NULL;
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f);
@@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
/*
- * atomic_long_inc_not_zero() above provided a full memory
- * barrier when we acquired a reference.
+ * file_ref_get() above provided a full memory barrier when we
+ * acquired a reference.
*
* This is paired with the write barrier from assigning to the
* __rcu protected file pointer so that if that pointer still
@@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* We need to confirm it by incrementing the refcount
* and then check the lookup again.
*
- * atomic_long_inc_not_zero() gives us a full memory
- * barrier. We only really need an 'acquire' one to
- * protect the loads below, but we don't have that.
+ * file_ref_get() gives us a full memory barrier. We
+ * only really need an 'acquire' one to protect the
+ * loads below, but we don't have that.
*/
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
continue;
/*
@@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *lookup_fdget_rcu(unsigned int fd)
-{
- return __fget_files_rcu(current->files, fd, 0);
-
-}
-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
-
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
-{
- /* Must be called with rcu_read_lock held */
- struct files_struct *files;
- struct file *file = NULL;
-
- task_lock(task);
- files = task->files;
- if (files)
- file = __fget_files_rcu(files, fd, 0);
- task_unlock(task);
-
- return file;
-}
-
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
task_lock(task);
files = task->files;
if (files) {
+ rcu_read_lock();
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
+ rcu_read_unlock();
}
task_unlock(task);
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
+EXPORT_SYMBOL(fget_task_next);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1096,6 +1120,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
*
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
+ *
+ * (As an exception to rule 2, you can call filp_close between fget_light and
+ * fput_light provided that you capture a real refcount with get_file before
+ * the call to filp_close, and ensure that this real refcount is fput *after*
+ * the fput_light call.)
+ *
+ * See also the documentation in rust/kernel/file.rs.
*/
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
@@ -1176,13 +1207,8 @@ void __f_unlock_pos(struct file *f)
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_close_on_exec(fd, files_fdtable(files), flag);
spin_unlock(&files->file_lock);
}
@@ -1223,11 +1249,7 @@ __releases(&files->file_lock)
goto Ebusy;
get_file(file);
rcu_assign_pointer(fdt->fd[fd], file);
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
spin_unlock(&files->file_lock);
if (tofree)
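
For context on __file_ref_put() earlier in this file: the fast paths it pairs with live in include/linux/file_ref.h. A simplified sketch, assuming file_ref_t wraps an atomic_long_t whose "negative" (huge unsigned) range encodes the NOREF/DEAD/SATURATED zones the slowpath sorts out:

/* Hedged sketch of the fast paths; not the exact kernel text. */
static inline bool file_ref_get_sketch(file_ref_t *ref)
{
	/* Unconditional increment: if the old value was in the dead or
	 * saturated zone the result stays negative and the get fails. */
	return !atomic_long_add_negative(1, &ref->refcnt);
}

static inline bool file_ref_put_sketch(file_ref_t *ref)
{
	long cnt = atomic_long_dec_return(&ref->refcnt);

	if (likely(cnt >= 0))
		return false;		/* still referenced */
	/* Out of the valid zone: let the slowpath decide whether this
	 * was the final reference. */
	return __file_ref_put(ref, (unsigned long)cnt);
}
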
diff --git a/fs/file_table.c b/fs/file_table.c
index eed5ffad9997..976736be47cb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -9,7 +9,6 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
+static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path user_path;
+ union {
+ struct path user_path;
+ freeptr_t bf_freeptr;
+ };
};
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +71,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f));
- kfree(backing_file(f));
+ kmem_cache_free(bfilp_cachep, backing_file(f));
} else {
kmem_cache_free(filp_cachep, f);
}
@@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- f->f_flags = flags;
- f->f_mode = OPEN_FMODE(flags);
- /* f->f_version: 0 */
+ memset(&f->f_path, 0, sizeof(f->f_path));
+ memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+ f->f_flags = flags;
+ f->f_mode = OPEN_FMODE(flags);
+
+ f->f_op = NULL;
+ f->f_mapping = NULL;
+ f->private_data = NULL;
+ f->f_inode = NULL;
+ f->f_owner = NULL;
+#ifdef CONFIG_EPOLL
+ f->f_ep = NULL;
+#endif
+
+ f->f_iocb_flags = 0;
+ f->f_pos = 0;
+ f->f_wb_err = 0;
+ f->f_sb_err = 0;
/*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
- atomic_long_set(&f->f_count, 1);
+ file_ref_init(&f->f_ref, 1);
return 0;
}
@@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff;
int error;
- ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error)) {
- kfree(ff);
+ kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error);
}
@@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +531,7 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count))
+ if (file_ref_put(&file->f_ref))
__fput(file);
}
@@ -529,6 +548,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+ args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
+ bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
+ &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
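
The new bfilp cache can be SLAB_TYPESAFE_BY_RCU without growing the object because freeptr_offset points the allocator's free pointer at storage that is dead once the file is gone: the user_path union member. A reduced sketch of the pattern, assuming struct kmem_cache_args as in include/linux/slab.h:

/* Hedged sketch: TYPESAFE_BY_RCU cache with an in-object free pointer. */
#include <linux/slab.h>

struct example {
	struct file file;
	union {
		struct path user_path;	/* valid while allocated */
		freeptr_t freeptr;	/* used by the allocator when free */
	};
};

static struct kmem_cache *example_cachep;

static int __init example_cache_init(void)
{
	struct kmem_cache_args args = {
		.freeptr_offset		= offsetof(struct example, freeptr),
		.use_freeptr_offset	= true,
	};

	example_cachep = kmem_cache_create("example", sizeof(struct example),
					   &args,
					   SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					   SLAB_TYPESAFE_BY_RCU);
	return example_cachep ? 0 : -ENOMEM;
}
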
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index fbcd603365ad..8c67627f2a3d 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -25,7 +25,7 @@
struct vxfs_dirblk {
__fs16 d_free; /* free space in dirblock */
__fs16 d_nhash; /* no of hash chains */
- __fs16 d_hash[1]; /* hash chain */
+ __fs16 d_hash[]; /* hash chain */
};
/*
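
The d_hash[1] to d_hash[] change above is the standard conversion from a one-element array to a C99 flexible array member, so the compiler and FORTIFY can see the real bounds. A generic sketch of allocating such a structure with overflow-checked sizing (illustrative, not vxfs code; vxfs reads these blocks from disk rather than allocating them):

/* Hedged sketch: flexible-array allocation via struct_size(). */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct dirblk {
	__u16 d_free;
	__u16 d_nhash;
	__u16 d_hash[];		/* d_nhash chain heads follow the header */
};

static struct dirblk *dirblk_alloc(__u16 nhash)
{
	struct dirblk *db;

	/* struct_size() guards the multiply and add against overflow. */
	db = kzalloc(struct_size(db, d_hash, nhash), GFP_KERNEL);
	if (db)
		db->d_nhash = nhash;
	return db;
}
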
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d8bec3c1bb1f..3cd99e2dc6ac 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
-EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
@@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
+static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
@@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
-EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode_attach_wb(inode, NULL);
+ wbc_attach_and_unlock_inode(wbc, inode);
+}
+EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
/**
* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
* @wbc: writeback_control of the writeback in progress
- * @page: page being written out
+ * @folio: folio being written out
* @bytes: number of bytes being written out
*
- * @bytes from @page are about to written out during the writeback
+ * @bytes from @folio are about to be written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes)
{
- struct folio *folio;
struct cgroup_subsys_state *css;
int id;
@@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
if (!wbc->wb || wbc->no_cgroup_owner)
return;
- folio = page_folio(page);
css = mem_cgroup_css_from_folio(folio);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
@@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
}
}
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
+{
+ spin_unlock(&inode->i_lock);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
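
wbc_attach_fdatawrite_inode() exists because wbc_attach_and_unlock_inode() became static above: __filemap_fdatawrite_range() still needs a way to attach the inode's bdi_writeback before writing. A sketch of the intended caller, assuming the usual mm/filemap.c helpers; the real call site may differ in detail:

/* Hedged sketch: an fdatawrite-style writeback entry point. */
static int fdatawrite_range_sketch(struct address_space *mapping,
				   loff_t start, loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode	= sync_mode,
		.nr_to_write	= LONG_MAX,
		.range_start	= start,
		.range_end	= end,
	};
	int ret;

	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
	ret = do_writepages(mapping, &wbc);
	wbc_detach_inode(&wbc);		/* runs foreign-inode detection */
	return ret;
}
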
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec34e5a..16fa61ef56bf 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc,
f = getname_kernel(param->string);
if (IS_ERR(f))
return PTR_ERR(f);
+ param->dirfd = AT_FDCWD;
put_f = true;
break;
case fs_value_is_filename:
@@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
}
EXPORT_SYMBOL(fs_param_is_fd);
+int fs_param_is_file_or_string(struct p_log *log,
+ const struct fs_parameter_spec *p,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ switch (param->type) {
+ case fs_value_is_string:
+ return fs_param_is_string(log, p, param, result);
+ case fs_value_is_file:
+ result->uint_32 = param->dirfd;
+ if (result->uint_32 <= INT_MAX)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ return fs_param_bad_value(log, param);
+}
+EXPORT_SYMBOL(fs_param_is_file_or_string);
+
int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 6cef3deccded..094a7f510edf 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -349,7 +349,6 @@ SYSCALL_DEFINE5(fsconfig,
int, aux)
{
struct fs_context *fc;
- struct fd f;
int ret;
int lookup_flags = 0;
@@ -392,12 +391,11 @@ SYSCALL_DEFINE5(fsconfig,
return -EOPNOTSUPP;
}
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto out_f;
+ return -EINVAL;
fc = fd_file(f)->private_data;
if (fc->ops == &legacy_fs_context_ops) {
@@ -407,17 +405,14 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
case FSCONFIG_CMD_CREATE_EXCL:
- ret = -EOPNOTSUPP;
- goto out_f;
+ return -EOPNOTSUPP;
}
}
if (_key) {
param.key = strndup_user(_key, 256);
- if (IS_ERR(param.key)) {
- ret = PTR_ERR(param.key);
- goto out_f;
- }
+ if (IS_ERR(param.key))
+ return PTR_ERR(param.key);
}
switch (cmd) {
@@ -496,7 +491,5 @@ SYSCALL_DEFINE5(fsconfig,
}
out_key:
kfree(param.key);
-out_f:
- fdput(f);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1f64ae6d7a69..0723c6344b20 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2371,13 +2371,12 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
int res;
int oldfd;
struct fuse_dev *fud = NULL;
- struct fd f;
if (get_user(oldfd, argp))
return -EFAULT;
- f = fdget(oldfd);
- if (!fd_file(f))
+ CLASS(fd, f)(oldfd);
+ if (fd_empty(f))
return -EINVAL;
/*
@@ -2394,7 +2393,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
mutex_unlock(&fuse_mutex);
}
- fdput(f);
return res;
}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index bbac547dfcb3..607ef735ad4a 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file)
fuse_invalidate_atime(inode);
}
-static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret)
+static void fuse_passthrough_end_write(struct kiocb *iocb, ssize_t ret)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(iocb->ki_filp);
- fuse_write_update_attr(inode, pos, ret);
+ fuse_write_update_attr(inode, iocb->ki_pos, ret);
}
ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -34,7 +34,6 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
.accessed = fuse_file_accessed,
};
@@ -62,7 +61,6 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
.end_write = fuse_passthrough_end_write,
};
@@ -88,15 +86,20 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
struct file *backing_file = fuse_file_passthrough(ff);
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = in,
.accessed = fuse_file_accessed,
};
+ struct kiocb iocb;
+ ssize_t ret;
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
- backing_file, ppos ? *ppos : 0, len, flags);
+ backing_file, *ppos, len, flags);
- return backing_file_splice_read(backing_file, ppos, pipe, len, flags,
- &ctx);
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
+
+ return ret;
}
ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
@@ -109,16 +112,18 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = out,
.end_write = fuse_passthrough_end_write,
};
+ struct kiocb iocb;
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
- backing_file, ppos ? *ppos : 0, len, flags);
+ backing_file, *ppos, len, flags);
inode_lock(inode);
- ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags,
- &ctx);
+ init_sync_kiocb(&iocb, out);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_write(pipe, backing_file, &iocb, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
inode_unlock(inode);
return ret;
@@ -130,7 +135,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
struct file *backing_file = fuse_file_passthrough(ff);
struct backing_file_ctx ctx = {
.cred = ff->cred,
- .user_file = file,
.accessed = fuse_file_accessed,
};
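The splice helpers above switch from passing a bare loff_t pointer to passing a kiocb: the caller seeds iocb.ki_pos from *ppos, backing_file_splice_read()/write() advance it, and the final position is copied back. A condensed sketch of that round-trip under the new signatures (example_splice_read is hypothetical):

static ssize_t example_splice_read(struct file *file, struct file *backing_file,
				   loff_t *ppos, struct pipe_inode_info *pipe,
				   size_t len, unsigned int flags,
				   struct backing_file_ctx *ctx)
{
	struct kiocb iocb;
	ssize_t ret;

	init_sync_kiocb(&iocb, file);	/* bind the kiocb to the user-facing file */
	iocb.ki_pos = *ppos;		/* seed from the caller's offset */
	ret = backing_file_splice_read(backing_file, &iocb, pipe, len, flags, ctx);
	*ppos = iocb.ki_pos;		/* hand the advanced position back */
	return ret;
}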
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d418d8b5367f..3334c394ce9c 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f7dd64856c9b..1e73cf87ff88 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = {
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
+ .fop_flags = FOP_ASYNC_LOCK,
};
const struct file_operations gfs2_dir_fops = {
@@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.llseek = default_llseek,
+ .fop_flags = FOP_ASYNC_LOCK,
};
#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
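The gfs2 hunks relocate the async-lock capability: instead of the export_operations flag EXPORT_OP_ASYNC_LOCK, the property is now declared per file_operations via FOP_ASYNC_LOCK, next to the ->lock()/->flock() handlers it describes. A sketch of the new declaration site; the handler names are hypothetical:

static const struct file_operations example_fops = {
	.lock		= example_lock,		/* hypothetical ->lock handler */
	.flock		= example_flock,	/* hypothetical ->flock handler */
	.fop_flags	= FOP_ASYNC_LOCK,	/* handlers accept async lock requests */
};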
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 269c3bc7fced..4701c4aafbf4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,7 +34,6 @@
#include <linux/lockref.h>
#include <linux/rhashtable.h>
#include <linux/pid_namespace.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include "gfs2.h"
@@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
i->file = NULL;
}
- rcu_read_lock();
for(;; i->fd++) {
- struct inode *inode;
-
- i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
+ i->file = fget_task_next(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
- inode = file_inode(i->file);
- if (inode->i_sb == i->sb)
+ if (file_inode(i->file)->i_sb == i->sb)
break;
- rcu_read_unlock();
fput(i->file);
- rcu_read_lock();
}
- rcu_read_unlock();
return i->file;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eeac99765f0d..3bee9b5dba5e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -15,10 +15,11 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/vfs.h>
@@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfs_remount(struct super_block *sb, int *flags, char *data)
+static int hfs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ fc->sb_flags |= SB_NODIRATIME;
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & SB_RDONLY)) {
+
+ if (!(fc->sb_flags & SB_RDONLY)) {
if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = {
.put_super = hfs_put_super,
.sync_fs = hfs_sync_fs,
.statfs = hfs_statfs,
- .remount_fs = hfs_remount,
.show_options = hfs_show_options,
};
@@ -188,181 +191,112 @@ enum {
opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
opt_part, opt_session, opt_type, opt_creator, opt_quiet,
opt_codepage, opt_iocharset,
- opt_err
};
-static const match_table_t tokens = {
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_umask, "umask=%o" },
- { opt_file_umask, "file_umask=%o" },
- { opt_dir_umask, "dir_umask=%o" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_type, "type=%s" },
- { opt_creator, "creator=%s" },
- { opt_quiet, "quiet" },
- { opt_codepage, "codepage=%s" },
- { opt_iocharset, "iocharset=%s" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_u32 ("uid", opt_uid),
+ fsparam_u32 ("gid", opt_gid),
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_u32oct ("file_umask", opt_file_umask),
+ fsparam_u32oct ("dir_umask", opt_dir_umask),
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("type", opt_type),
+ fsparam_string ("creator", opt_creator),
+ fsparam_flag ("quiet", opt_quiet),
+ fsparam_string ("codepage", opt_codepage),
+ fsparam_string ("iocharset", opt_iocharset),
+ {}
};
-static inline int match_fourchar(substring_t *arg, u32 *result)
-{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
-
/*
- * parse_options()
+ * hfs_parse_param()
*
- * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger
- * This function is called by hfs_read_super() to parse the mount options.
+ * This function is called by the vfs to parse the mount options.
*/
-static int parse_options(char *options, struct hfs_sb_info *hsb)
+static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
-
- /* initialize the sb with defaults */
- hsb->s_uid = current_uid();
- hsb->s_gid = current_gid();
- hsb->s_file_umask = 0133;
- hsb->s_dir_umask = 0022;
- hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
- hsb->s_quiet = 0;
- hsb->part = -1;
- hsb->session = -1;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(hsb->s_uid)) {
- pr_err("invalid uid %d\n", tmp);
- return 0;
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(hsb->s_gid)) {
- pr_err("invalid gid %d\n", tmp);
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_file_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("file_umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- break;
- case opt_dir_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("dir_umask requires a value\n");
- return 0;
- }
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_part:
- if (match_int(&args[0], &hsb->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &hsb->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &hsb->s_type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_creator:
- if (match_fourchar(&args[0], &hsb->s_creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_quiet:
- hsb->s_quiet = 1;
- break;
- case opt_codepage:
- if (hsb->nls_disk) {
- pr_err("unable to change codepage\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_disk = load_nls(p);
- if (!hsb->nls_disk) {
- pr_err("unable to load codepage \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_iocharset:
- if (hsb->nls_io) {
- pr_err("unable to change iocharset\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_io = load_nls(p);
- if (!hsb->nls_io) {
- pr_err("unable to load iocharset \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- default:
- return 0;
- }
- }
+ struct hfs_sb_info *hsb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ /* hfs does not honor any fs-specific options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
- if (hsb->nls_disk && !hsb->nls_io) {
- hsb->nls_io = load_nls_default();
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case opt_uid:
+ hsb->s_uid = result.uid;
+ break;
+ case opt_gid:
+ hsb->s_gid = result.gid;
+ break;
+ case opt_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_file_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ break;
+ case opt_dir_umask:
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_part:
+ hsb->part = result.uint_32;
+ break;
+ case opt_session:
+ hsb->session = result.uint_32;
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&hsb->s_type, param->string, 4);
+ break;
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&hsb->s_creator, param->string, 4);
+ break;
+ case opt_quiet:
+ hsb->s_quiet = 1;
+ break;
+ case opt_codepage:
+ if (hsb->nls_disk) {
+ pr_err("unable to change codepage\n");
+ return -EINVAL;
+ }
+ hsb->nls_disk = load_nls(param->string);
+ if (!hsb->nls_disk) {
+ pr_err("unable to load codepage \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_iocharset:
+ if (hsb->nls_io) {
+ pr_err("unable to change iocharset\n");
+ return -EINVAL;
+ }
+ hsb->nls_io = load_nls(param->string);
if (!hsb->nls_io) {
- pr_err("unable to load default iocharset\n");
- return 0;
+ pr_err("unable to load iocharset \"%s\"\n",
+ param->string);
+ return -EINVAL;
}
+ break;
+ default:
+ return -EINVAL;
}
- hsb->s_dir_umask &= 0777;
- hsb->s_file_umask &= 0577;
- return 1;
+ return 0;
}
/*
@@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
* hfs_btree_init() to get the necessary data about the extents and
* catalog B-trees and, finally, reading the root inode into memory.
*/
-static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- struct hfs_sb_info *sbi;
+ struct hfs_sb_info *sbi = HFS_SB(sb);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct inode *root_inode;
+ int silent = fc->sb_flags & SB_SILENT;
int res;
- sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
+ /* load_nls_default does not fail */
+ if (sbi->nls_disk && !sbi->nls_io)
+ sbi->nls_io = load_nls_default();
+ sbi->s_dir_umask &= 0777;
+ sbi->s_file_umask &= 0577;
- sbi->sb = sb;
- sb->s_fs_info = sbi;
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb);
- res = -EINVAL;
- if (!parse_options((char *)data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto bail;
- }
-
+ sbi->sb = sb;
sb->s_op = &hfs_super_operations;
sb->s_xattr = hfs_xattr_handlers;
sb->s_flags |= SB_NODIRATIME;
@@ -451,18 +381,56 @@ bail:
return res;
}
-static struct dentry *hfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hfs_fill_super);
+}
+
+static void hfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfs_context_ops = {
+ .parse_param = hfs_parse_param,
+ .get_tree = hfs_get_tree,
+ .reconfigure = hfs_reconfigure,
+ .free = hfs_free_fc,
+};
+
+static int hfs_init_fs_context(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+ struct hfs_sb_info *hsb;
+
+ hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
+ if (!hsb)
+ return -ENOMEM;
+
+ fc->s_fs_info = hsb;
+ fc->ops = &hfs_context_ops;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* initialize options with defaults */
+ hsb->s_uid = current_uid();
+ hsb->s_gid = current_gid();
+ hsb->s_file_umask = 0133;
+ hsb->s_dir_umask = 0022;
+ hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_quiet = 0;
+ hsb->part = -1;
+ hsb->session = -1;
+ }
+
+ return 0;
}
static struct file_system_type hfs_fs_type = {
.owner = THIS_MODULE,
.name = "hfs",
- .mount = hfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfs_init_fs_context,
};
MODULE_ALIAS_FS("hfs");
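With hfs on the new mount API, options are parsed one parameter at a time through hfs_parse_param() rather than in a single parse_options() pass, and the same code path can be driven explicitly from userspace with the fsopen(2) family. A hedged userspace sketch, error handling elided; the 4-character "TEXT" value matches the strlen check in hfs_parse_param above:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

/* Sketch: mount an hfs volume through the new mount API syscalls. */
int example_mount_hfs(const char *dev)
{
	int fsfd = syscall(SYS_fsopen, "hfs", 0);

	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", dev, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "type", "TEXT", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	return syscall(SYS_fsmount, fsfd, 0, 0);
}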
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 59ce81dca73f..2f089bff0095 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -21,6 +21,7 @@
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/fs_context.h>
#include "hfsplus_raw.h"
#define DBG_BNODE_REFS 0x00000001
@@ -156,6 +157,7 @@ struct hfsplus_sb_info {
/* Runtime variables */
u32 blockoffset;
+ u32 min_io_size;
sector_t part_start;
sector_t sect_count;
int fs_shift;
@@ -307,7 +309,7 @@ struct hfsplus_readdir_data {
*/
static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
{
- return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev),
+ return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size,
HFSPLUS_SECTOR_SIZE);
}
@@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
/* options.c */
void hfsplus_fill_defaults(struct hfsplus_sb_info *opts);
-int hfsplus_parse_options_remount(char *input, int *force);
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi);
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param);
int hfsplus_show_options(struct seq_file *seq, struct dentry *root);
/* part_tbl.c */
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c94a58762ad6..a66a09a56bf7 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,7 +12,8 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/nls.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
@@ -23,26 +24,23 @@ enum {
opt_creator, opt_type,
opt_umask, opt_uid, opt_gid,
opt_part, opt_session, opt_nls,
- opt_nodecompose, opt_decompose,
- opt_barrier, opt_nobarrier,
- opt_force, opt_err
+ opt_decompose, opt_barrier,
+ opt_force,
};
-static const match_table_t tokens = {
- { opt_creator, "creator=%s" },
- { opt_type, "type=%s" },
- { opt_umask, "umask=%o" },
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_nls, "nls=%s" },
- { opt_decompose, "decompose" },
- { opt_nodecompose, "nodecompose" },
- { opt_barrier, "barrier" },
- { opt_nobarrier, "nobarrier" },
- { opt_force, "force" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_string ("creator", opt_creator),
+ fsparam_string ("type", opt_type),
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_u32 ("uid", opt_uid),
+ fsparam_u32 ("gid", opt_gid),
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("nls", opt_nls),
+ fsparam_flag_no ("decompose", opt_decompose),
+ fsparam_flag_no ("barrier", opt_barrier),
+ fsparam_flag ("force", opt_force),
+ {}
};
/* Initialize an options object to reasonable defaults */
@@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
opts->session = -1;
}
-/* convert a "four byte character" to a 32 bit int with error checks */
-static inline int match_fourchar(substring_t *arg, u32 *result)
+/* Parse options from mount. Returns a negative errno on failure. */
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
-
-int hfsplus_parse_options_remount(char *input, int *force)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int token;
-
- if (!input)
- return 1;
-
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_force:
- *force = 1;
- break;
- default:
- break;
+ struct hfsplus_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ /*
+ * Only the force option is examined during remount; all others
+ * are ignored.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ strncmp(param->key, "force", 5))
+ return 0;
+
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
}
- }
-
- return 1;
-}
-
-/* Parse options from mount. Returns 0 on failure */
-/* input is the options passed to mount() as a string */
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
-
- if (!input)
- goto done;
-
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_creator:
- if (match_fourchar(&args[0], &sbi->creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &sbi->type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- sbi->umask = (umode_t)tmp;
- break;
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(sbi->uid)) {
- pr_err("invalid uid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_UID, &sbi->flags);
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(sbi->gid)) {
- pr_err("invalid gid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_GID, &sbi->flags);
- }
- break;
- case opt_part:
- if (match_int(&args[0], &sbi->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &sbi->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_nls:
- if (sbi->nls) {
- pr_err("unable to change nls mapping\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- sbi->nls = load_nls(p);
- if (!sbi->nls) {
- pr_err("unable to load nls mapping \"%s\"\n",
- p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_decompose:
- clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_nodecompose:
+ memcpy(&sbi->creator, param->string, 4);
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&sbi->type, param->string, 4);
+ break;
+ case opt_umask:
+ sbi->umask = (umode_t)result.uint_32;
+ break;
+ case opt_uid:
+ sbi->uid = result.uid;
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
+ break;
+ case opt_gid:
+ sbi->gid = result.gid;
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
+ break;
+ case opt_part:
+ sbi->part = result.uint_32;
+ break;
+ case opt_session:
+ sbi->session = result.uint_32;
+ break;
+ case opt_nls:
+ if (sbi->nls) {
+ pr_err("unable to change nls mapping\n");
+ return -EINVAL;
+ }
+ sbi->nls = load_nls(param->string);
+ if (!sbi->nls) {
+ pr_err("unable to load nls mapping \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_decompose:
+ if (result.negated)
set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_barrier:
- clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_nobarrier:
+ else
+ clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
+ break;
+ case opt_barrier:
+ if (result.negated)
set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_force:
- set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
- break;
- default:
- return 0;
- }
- }
-
-done:
- if (!sbi->nls) {
- /* try utf8 first, as this is the old default behaviour */
- sbi->nls = load_nls("utf8");
- if (!sbi->nls)
- sbi->nls = load_nls_default();
- if (!sbi->nls)
- return 0;
+ else
+ clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+ break;
+ case opt_force:
+ set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
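Note how fsparam_flag_no() collapses each decompose/nodecompose and barrier/nobarrier pair into one table entry: fs_parse() matches both spellings and reports the "no"-prefixed one via result.negated, which the switch above maps onto set_bit()/clear_bit(). The same idiom in isolation, with a hypothetical option name:

/*
 * fsparam_flag_no("compress", ...) matches both "compress" and
 * "nocompress"; the negated form sets result.negated.
 */
static const struct fs_parameter_spec example_spec[] = {
	fsparam_flag_no("compress", Opt_compress),	/* Opt_compress is hypothetical */
	{}
};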
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 97920202790f..948b8aaee33e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -14,6 +14,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/nls.h>
@@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
+static int hfsplus_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & SB_RDONLY)) {
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
- int force = 0;
-
- if (!hfsplus_parse_options_remount(data, &force))
- return -EINVAL;
+ if (!(fc->sb_flags & SB_RDONLY)) {
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
- } else if (force) {
+ fc->sb_flags |= SB_RDONLY;
+ } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
pr_warn("filesystem is marked journaled, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = {
.put_super = hfsplus_put_super,
.sync_fs = hfsplus_sync_fs,
.statfs = hfsplus_statfs,
- .remount_fs = hfsplus_remount,
.show_options = hfsplus_show_options,
};
-static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
+static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct hfsplus_vh *vhdr;
- struct hfsplus_sb_info *sbi;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
hfsplus_cat_entry entry;
struct hfs_find_data fd;
struct inode *root, *inode;
struct qstr str;
- struct nls_table *nls = NULL;
+ struct nls_table *nls;
u64 last_fs_block, last_fs_page;
+ int silent = fc->sb_flags & SB_SILENT;
int err;
- err = -ENOMEM;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- goto out;
-
- sb->s_fs_info = sbi;
mutex_init(&sbi->alloc_mutex);
mutex_init(&sbi->vh_mutex);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
- hfsplus_fill_defaults(sbi);
err = -EINVAL;
- if (!hfsplus_parse_options(data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto out_unload_nls;
+ if (!sbi->nls) {
+ /* try utf8 first, as this is the old default behaviour */
+ sbi->nls = load_nls("utf8");
+ if (!sbi->nls)
+ sbi->nls = load_nls_default();
}
/* temporarily use utf8 to correctly find the hidden dir below */
@@ -616,7 +611,6 @@ out_unload_nls:
unload_nls(sbi->nls);
unload_nls(nls);
kfree(sbi);
-out:
return err;
}
@@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode)
#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
-static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfsplus_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hfsplus_fill_super);
+}
+
+static void hfsplus_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfsplus_context_ops = {
+ .parse_param = hfsplus_parse_param,
+ .get_tree = hfsplus_get_tree,
+ .reconfigure = hfsplus_reconfigure,
+ .free = hfsplus_free_fc,
+};
+
+static int hfsplus_init_fs_context(struct fs_context *fc)
+{
+ struct hfsplus_sb_info *sbi;
+
+ sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ hfsplus_fill_defaults(sbi);
+
+ fc->s_fs_info = sbi;
+ fc->ops = &hfsplus_context_ops;
+
+ return 0;
}
static struct file_system_type hfsplus_fs_type = {
.owner = THIS_MODULE,
.name = "hfsplus",
- .mount = hfsplus_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfsplus_init_fs_context,
};
MODULE_ALIAS_FS("hfsplus");
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 9592ffcb44e5..74801911bc1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (!blocksize)
goto out;
+ sbi->min_io_size = blocksize;
+
if (hfsplus_get_last_session(sb, &part_start, &part_size))
goto out;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e73717daa5f9..27567920abe4 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,7 +9,8 @@
#include "hpfs_fn.h"
#include <linux/module.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/init.h>
#include <linux/statfs.h>
#include <linux/magic.h>
@@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
hpfs_sb(s)->sb_was_error = 1;
}
-/*
+/*
* A little trick to detect cycles in many hpfs structures and don't let the
* kernel crash on corrupted filesystem. When first called, set c2 to 0.
*
@@ -272,146 +273,70 @@ static void destroy_inodecache(void)
kmem_cache_destroy(hpfs_inode_cachep);
}
-/*
- * A tiny parser for option strings, stolen from dosfs.
- * Stolen again from read-only hpfs.
- * And updated for table-driven option parsing.
- */
-
enum {
- Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis,
- Opt_check_none, Opt_check_normal, Opt_check_strict,
- Opt_err_cont, Opt_err_ro, Opt_err_panic,
- Opt_eas_no, Opt_eas_ro, Opt_eas_rw,
- Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always,
- Opt_timeshift, Opt_err,
+ Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case,
+ Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift,
};
-static const match_table_t tokens = {
- {Opt_help, "help"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_case_lower, "case=lower"},
- {Opt_case_asis, "case=asis"},
- {Opt_check_none, "check=none"},
- {Opt_check_normal, "check=normal"},
- {Opt_check_strict, "check=strict"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_err_panic, "errors=panic"},
- {Opt_eas_no, "eas=no"},
- {Opt_eas_ro, "eas=ro"},
- {Opt_eas_rw, "eas=rw"},
- {Opt_chkdsk_no, "chkdsk=no"},
- {Opt_chkdsk_errors, "chkdsk=errors"},
- {Opt_chkdsk_always, "chkdsk=always"},
- {Opt_timeshift, "timeshift=%d"},
- {Opt_err, NULL},
+static const struct constant_table hpfs_param_case[] = {
+ {"asis", 0},
+ {"lower", 1},
+ {}
};
-static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
- int *lowercase, int *eas, int *chk, int *errs,
- int *chkdsk, int *timeshift)
-{
- char *p;
- int option;
+static const struct constant_table hpfs_param_check[] = {
+ {"none", 0},
+ {"normal", 1},
+ {"strict", 2},
+ {}
+};
- if (!opts)
- return 1;
+static const struct constant_table hpfs_param_err[] = {
+ {"continue", 0},
+ {"remount-ro", 1},
+ {"panic", 2},
+ {}
+};
- /*pr_info("Parsing opts: '%s'\n",opts);*/
-
- while ((p = strsep(&opts, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_help:
- return 2;
- case Opt_uid:
- if (match_int(args, &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- break;
- case Opt_umask:
- if (match_octal(args, &option))
- return 0;
- *umask = option;
- break;
- case Opt_case_lower:
- *lowercase = 1;
- break;
- case Opt_case_asis:
- *lowercase = 0;
- break;
- case Opt_check_none:
- *chk = 0;
- break;
- case Opt_check_normal:
- *chk = 1;
- break;
- case Opt_check_strict:
- *chk = 2;
- break;
- case Opt_err_cont:
- *errs = 0;
- break;
- case Opt_err_ro:
- *errs = 1;
- break;
- case Opt_err_panic:
- *errs = 2;
- break;
- case Opt_eas_no:
- *eas = 0;
- break;
- case Opt_eas_ro:
- *eas = 1;
- break;
- case Opt_eas_rw:
- *eas = 2;
- break;
- case Opt_chkdsk_no:
- *chkdsk = 0;
- break;
- case Opt_chkdsk_errors:
- *chkdsk = 1;
- break;
- case Opt_chkdsk_always:
- *chkdsk = 2;
- break;
- case Opt_timeshift:
- {
- int m = 1;
- char *rhs = args[0].from;
- if (!rhs || !*rhs)
- return 0;
- if (*rhs == '-') m = -1;
- if (*rhs == '+' || *rhs == '-') rhs++;
- *timeshift = simple_strtoul(rhs, &rhs, 0) * m;
- if (*rhs)
- return 0;
- break;
- }
- default:
- return 0;
- }
- }
- return 1;
-}
+static const struct constant_table hpfs_param_eas[] = {
+ {"no", 0},
+ {"ro", 1},
+ {"rw", 2},
+ {}
+};
+
+static const struct constant_table hpfs_param_chkdsk[] = {
+ {"no", 0},
+ {"errors", 1},
+ {"always", 2},
+ {}
+};
+
+static const struct fs_parameter_spec hpfs_param_spec[] = {
+ fsparam_flag ("help", Opt_help),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_enum ("case", Opt_case, hpfs_param_case),
+ fsparam_enum ("check", Opt_check, hpfs_param_check),
+ fsparam_enum ("errors", Opt_err, hpfs_param_err),
+ fsparam_enum ("eas", Opt_eas, hpfs_param_eas),
+ fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk),
+ fsparam_s32 ("timeshift", Opt_timeshift),
+ {}
+};
+
+struct hpfs_fc_context {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t umask;
+ int lowercase;
+ int eas;
+ int chk;
+ int errs;
+ int chkdsk;
+ int timeshift;
+};
static inline void hpfs_help(void)
{
@@ -439,49 +364,92 @@ HPFS filesystem options:\n\
\n");
}
-static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
+static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
- int o;
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, hpfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_help:
+ hpfs_help();
+ return -EINVAL;
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+ case Opt_umask:
+ ctx->umask = result.uint_32;
+ break;
+ case Opt_case:
+ ctx->lowercase = result.uint_32;
+ break;
+ case Opt_check:
+ ctx->chk = result.uint_32;
+ break;
+ case Opt_err:
+ ctx->errs = result.uint_32;
+ break;
+ case Opt_eas:
+ ctx->eas = result.uint_32;
+ break;
+ case Opt_chkdsk:
+ ctx->chkdsk = result.uint_32;
+ break;
+ case Opt_timeshift:
+ {
+ int m = 1;
+ char *rhs = param->string;
+ int timeshift;
+
+ if (*rhs == '-') m = -1;
+ if (*rhs == '+' || *rhs == '-') rhs++;
+ timeshift = simple_strtoul(rhs, &rhs, 0) * m;
+ if (*rhs)
+ return -EINVAL;
+ ctx->timeshift = timeshift;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hpfs_reconfigure(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct super_block *s = fc->root->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
sync_filesystem(s);
- *flags |= SB_NOATIME;
+ fc->sb_flags |= SB_NOATIME;
hpfs_lock(s);
- uid = sbi->sb_uid; gid = sbi->sb_gid;
- umask = 0777 & ~sbi->sb_mode;
- lowercase = sbi->sb_lowercase;
- eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk;
- errs = sbi->sb_err; timeshift = sbi->sb_timeshift;
-
- if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, &timeshift))) {
- pr_err("bad mount options.\n");
- goto out_err;
- }
- if (o == 2) {
- hpfs_help();
- goto out_err;
- }
- if (timeshift != sbi->sb_timeshift) {
+
+ if (ctx->timeshift != sbi->sb_timeshift) {
pr_err("timeshift can't be changed using remount.\n");
goto out_err;
}
unmark_dirty(s);
- sbi->sb_uid = uid; sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
+ sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift;
- if (!(*flags & SB_RDONLY)) mark_dirty(s, 1);
+ if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1);
hpfs_unlock(s);
return 0;
@@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops =
.evict_inode = hpfs_evict_inode,
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
- .remount_fs = hpfs_remount_fs,
.show_options = hpfs_show_options,
};
-static int hpfs_fill_super(struct super_block *s, void *options, int silent)
+static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct hpfs_fc_context *ctx = fc->fs_private;
struct buffer_head *bh0, *bh1, *bh2;
struct hpfs_boot_block *bootblock;
struct hpfs_super_block *superblock;
struct hpfs_spare_block *spareblock;
struct hpfs_sb_info *sbi;
struct inode *root;
-
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
+ int silent = fc->sb_flags & SB_SILENT;
dnode_secno root_dno;
struct hpfs_dirent *de = NULL;
struct quad_buffer_head qbh;
- int o;
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
@@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
mutex_init(&sbi->hpfs_mutex);
hpfs_lock(s);
- uid = current_uid();
- gid = current_gid();
- umask = current_umask();
- lowercase = 0;
- eas = 2;
- chk = 1;
- errs = 1;
- chkdsk = 1;
- timeshift = 0;
-
- if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, &timeshift))) {
- pr_err("bad mount options.\n");
- goto bail0;
- }
- if (o==2) {
- hpfs_help();
- goto bail0;
- }
-
/*sbi->sb_mounting = 1;*/
sb_set_blocksize(s, 512);
sbi->sb_fs_size = -1;
@@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start);
sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band);
sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap);
- sbi->sb_uid = uid;
- sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
+ sbi->sb_uid = ctx->uid;
+ sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
sbi->sb_n_free = -1;
sbi->sb_n_free_dnodes = -1;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas;
- sbi->sb_chk = chk;
- sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs;
- sbi->sb_timeshift = timeshift;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas;
+ sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs;
+ sbi->sb_timeshift = ctx->timeshift;
sbi->sb_was_error = 0;
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
@@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
/* Check for general fs errors*/
if (spareblock->dirty && !spareblock->old_wrote) {
- if (errs == 2) {
+ if (sbi->sb_err == 2) {
pr_err("Improperly stopped, not mounted\n");
goto bail4;
}
@@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
}
if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
- if (errs >= 2) {
+ if (sbi->sb_err >= 2) {
pr_err("Spare dnodes used, try chkdsk\n");
mark_dirty(s, 0);
goto bail4;
}
hpfs_error(s, "warning: spare dnodes used, try chkdsk");
- if (errs == 0)
+ if (sbi->sb_err == 0)
pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
}
- if (chk) {
+ if (sbi->sb_chk) {
unsigned a;
if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) ||
le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) {
@@ -755,18 +697,70 @@ bail0:
return -EINVAL;
}
-static struct dentry *hpfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hpfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hpfs_fill_super);
+}
+
+static void hpfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+ kfree(fc->fs_private);
}
+static const struct fs_context_operations hpfs_fc_context_ops = {
+ .parse_param = hpfs_parse_param,
+ .get_tree = hpfs_get_tree,
+ .reconfigure = hpfs_reconfigure,
+ .free = hpfs_free_fc,
+};
+
+static int hpfs_init_fs_context(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct hpfs_sb_info *sbi = hpfs_sb(sb);
+
+ ctx->uid = sbi->sb_uid;
+ ctx->gid = sbi->sb_gid;
+ ctx->umask = 0777 & ~sbi->sb_mode;
+ ctx->lowercase = sbi->sb_lowercase;
+ ctx->eas = sbi->sb_eas;
+ ctx->chk = sbi->sb_chk;
+ ctx->chkdsk = sbi->sb_chkdsk;
+ ctx->errs = sbi->sb_err;
+ ctx->timeshift = sbi->sb_timeshift;
+
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->umask = current_umask();
+ ctx->lowercase = 0;
+ ctx->eas = 2;
+ ctx->chk = 1;
+ ctx->errs = 1;
+ ctx->chkdsk = 1;
+ ctx->timeshift = 0;
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &hpfs_fc_context_ops;
+
+ return 0;
+};
+
static struct file_system_type hpfs_fs_type = {
.owner = THIS_MODULE,
.name = "hpfs",
- .mount = hpfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hpfs_init_fs_context,
+ .parameters = hpfs_param_spec,
};
MODULE_ALIAS_FS("hpfs");
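The hpfs conversion shows the constant_table idiom: each multi-valued option such as check=none|normal|strict becomes one fsparam_enum() entry backed by a table, and fs_parse() returns the mapped integer in result.uint_32, replacing three match_table tokens per option. A minimal sketch under the same assumptions; the option name and values are hypothetical:

static const struct constant_table example_mode[] = {
	{"off",	0},
	{"ro",	1},
	{"rw",	2},
	{}
};

static const struct fs_parameter_spec example_spec[] = {
	fsparam_enum("mode", Opt_mode, example_mode),	/* Opt_mode is hypothetical */
	{}
};
/* In ->parse_param(): ctx->mode = result.uint_32; */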
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5cf327337e22..1bbf783b244a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/hugetlbfs.h>
+
static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -110,7 +113,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
+ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED);
vma->vm_ops = &hugetlb_vm_ops;
ret = seal_check_write(info->seals, vma);
@@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
+ trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
/*
@@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return hugetlbfs_punch_hole(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = hugetlbfs_punch_hole(inode, offset, len);
+ goto out_nolock;
+ }
/*
* Default preallocate case.
@@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
inode_set_ctime_current(inode);
out:
inode_unlock(inode);
+
+out_nolock:
+ trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
}
@@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
+ trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+ trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
static void hugetlbfs_free_inode(struct inode *inode)
{
+ trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
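The hugetlbfs hunks add tracepoints across inode lifetime, setattr, and fallocate; defining CREATE_TRACE_POINTS before including trace/events/hugetlbfs.h is what instantiates the event bodies in this one compilation unit. The header itself is not part of this diff; a hedged sketch of what one such definition typically looks like, with illustrative fields:

TRACE_EVENT(hugetlbfs_free_inode,
	TP_PROTO(struct inode *inode),
	TP_ARGS(inode),
	TP_STRUCT__entry(
		__field(dev_t,		dev)
		__field(unsigned long,	ino)
	),
	TP_fast_assign(
		__entry->dev = inode->i_sb->s_dev;
		__entry->ino = inode->i_ino;
	),
	TP_printk("dev %d,%d ino %lu",
		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino)
);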
diff --git a/fs/inode.c b/fs/inode.c
index 8dabb224f941..b13b778257ae 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,7 +21,12 @@
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <trace/events/writeback.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/timestamp.h>
+
#include "internal.h"
/*
@@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void)
return nr_dirty > 0 ? nr_dirty : 0;
}
+#ifdef CONFIG_DEBUG_FS
+static DEFINE_PER_CPU(long, mg_ctime_updates);
+static DEFINE_PER_CPU(long, mg_fine_stamps);
+static DEFINE_PER_CPU(long, mg_ctime_swaps);
+
+static unsigned long get_mg_ctime_updates(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_updates, i));
+ return sum;
+}
+
+static unsigned long get_mg_fine_stamps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_fine_stamps, i));
+ return sum;
+}
+
+static unsigned long get_mg_ctime_swaps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_swaps, i));
+ return sum;
+}
+
+#define mgtime_counter_inc(__var) this_cpu_inc(__var)
+
+static int mgts_show(struct seq_file *s, void *p)
+{
+ unsigned long ctime_updates = get_mg_ctime_updates();
+ unsigned long ctime_swaps = get_mg_ctime_swaps();
+ unsigned long fine_stamps = get_mg_fine_stamps();
+ unsigned long floor_swaps = timekeeping_get_mg_floor_swaps();
+
+ seq_printf(s, "%lu %lu %lu %lu\n",
+ ctime_updates, ctime_swaps, fine_stamps, floor_swaps);
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mgts);
+
+static int __init mg_debugfs_init(void)
+{
+ debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops);
+ return 0;
+}
+late_initcall(mg_debugfs_init);
+
+#else /* ! CONFIG_DEBUG_FS */
+
+#define mgtime_counter_inc(__var) do { } while (0)
+
+#endif /* CONFIG_DEBUG_FS */
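With CONFIG_DEBUG_FS enabled, the per-cpu counters above are summed on demand and exposed through a single read-only debugfs file: reading /sys/kernel/debug/multigrain_timestamps yields four space-separated totals in the order ctime updates, ctime swaps, fine-grained stamps, and floor swaps, matching the seq_printf() in mgts_show(). When debugfs is disabled, mgtime_counter_inc() compiles away entirely.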
+
/*
* Handle nr_inode sysctl
*/
@@ -174,6 +243,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
inode->i_opflags = 0;
if (sb->s_xattr)
inode->i_opflags |= IOP_XATTR;
+ if (sb->s_type->fs_flags & FS_MGTIME)
+ inode->i_opflags |= IOP_MGTIME;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
@@ -748,7 +819,7 @@ static void evict(struct inode *inode)
* ___wait_var_event() either sees the bit cleared or
* waitqueue_active() check in wake_up_var() sees the waiter.
*/
- smp_mb();
+ smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
@@ -1241,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a variant of iget5_locked() for callers that don't want to fail on memory
- * allocation of inode.
+ * and if present return it with an increased reference count. This is a
+ * variant of iget5_locked() that doesn't allocate an inode.
*
- * If the inode is not in cache, insert the pre-allocated inode to cache and
+ * If the inode is not present in the cache, insert the pre-allocated inode and
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -1314,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a generalized version of iget_locked() for file systems where the inode
+ * and if present return it with an increased reference count. This is a
+ * generalized version of iget_locked() for file systems where the inode
* number is not sufficient for unique identification of an inode.
*
- * If the inode is not in cache, allocate a new inode and return it locked,
- * hashed, and with the I_NEW flag set. The file system gets to fill it in
- * before unlocking it via unlock_new_inode().
+ * If the inode is not present in the cache, allocate and insert a new inode
+ * and return it locked, hashed, and with the I_NEW flag set. The file system
+ * gets to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -2211,19 +2281,58 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
+/**
+ * current_time - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp, but don't update
+ * the floor.
+ *
+ * For a multigrain inode, this is effectively an estimate of the timestamp
+ * that a file would receive. An actual update must go through
+ * inode_set_ctime_current().
+ */
+struct timespec64 current_time(struct inode *inode)
+{
+ struct timespec64 now;
+ u32 cns;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ if (!is_mgtime(inode))
+ goto out;
+
+ /* If nothing has queried it, then coarse time is fine */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ /*
+ * If there is no apparent change, then get a fine-grained
+ * timestamp.
+ */
+ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
+ ktime_get_real_ts64(&now);
+ }
+out:
+ return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_time);
+
static int inode_needs_update_time(struct inode *inode)
{
+ struct timespec64 now, ts;
int sync_it = 0;
- struct timespec64 now = current_time(inode);
- struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
+ now = current_time(inode);
+
ts = inode_get_mtime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it = S_MTIME;
+ sync_it |= S_MTIME;
ts = inode_get_ctime(inode);
if (!timespec64_equal(&ts, &now))
@@ -2600,6 +2709,16 @@ void inode_nohighmem(struct inode *inode)
}
EXPORT_SYMBOL(inode_nohighmem);
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
+{
+ trace_inode_set_ctime_to_ts(inode, &ts);
+ set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
+ inode->i_ctime_sec = ts.tv_sec;
+ inode->i_ctime_nsec = ts.tv_nsec;
+ return ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_to_ts);
+
/**
* timestamp_truncate - Truncate timespec to a granularity
* @t: Timespec
@@ -2632,39 +2751,159 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
EXPORT_SYMBOL(timestamp_truncate);
/**
- * current_time - Return FS time
- * @inode: inode.
+ * inode_set_ctime_current - set the ctime to current_time
+ * @inode: inode
*
- * Return the current time truncated to the time granularity supported by
- * the fs.
+ * Set the inode's ctime to the current value for the inode. Returns the
+ * current value that was assigned. If this is not a multigrain inode, then we
+ * set it to the later of the coarse time and floor value.
+ *
+ * If it is multigrain, then we first see if the coarse-grained timestamp is
+ * distinct from what is already there. If so, then use that. Otherwise, get a
+ * fine-grained timestamp.
*
- * Note that inode and inode->sb cannot be NULL.
- * Otherwise, the function warns and returns time without truncation.
+ * After that, try to swap the new value into i_ctime_nsec. Accept the
+ * resulting ctime, regardless of the outcome of the swap. If it has
+ * already been replaced, then that timestamp is later than the earlier
+ * unacceptable one, and is thus acceptable.
*/
-struct timespec64 current_time(struct inode *inode)
+struct timespec64 inode_set_ctime_current(struct inode *inode)
{
struct timespec64 now;
+ u32 cns, cur;
- ktime_get_coarse_real_ts64(&now);
- return timestamp_truncate(now, inode);
+ ktime_get_coarse_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+
+ /* Just return that if this is not a multigrain fs */
+ if (!is_mgtime(inode)) {
+ inode_set_ctime_to_ts(inode, now);
+ goto out;
+ }
+
+ /*
+ * A fine-grained time is only needed if someone has queried
+ * for timestamps, and the current coarse-grained time isn't
+ * later than what's already there.
+ */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
+ .tv_nsec = cns & ~I_CTIME_QUERIED };
+
+ if (timespec64_compare(&now, &ctime) <= 0) {
+ ktime_get_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+ mgtime_counter_inc(mg_fine_stamps);
+ }
+ }
+ mgtime_counter_inc(mg_ctime_updates);
+
+ /* No need to cmpxchg if it's exactly the same */
+ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) {
+ trace_ctime_xchg_skip(inode, &now);
+ goto out;
+ }
+ cur = cns;
+retry:
+ /* Try to swap the nsec value into place. */
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
+ /* If swap occurred, then we're (mostly) done */
+ inode->i_ctime_sec = now.tv_sec;
+ trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
+ mgtime_counter_inc(mg_ctime_swaps);
+ } else {
+ /*
+ * Was the change due to someone marking the old ctime QUERIED?
+ * If so then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
+ cns = cur;
+ goto retry;
+ }
+ /* Otherwise, keep the existing ctime */
+ now.tv_sec = inode->i_ctime_sec;
+ now.tv_nsec = cur & ~I_CTIME_QUERIED;
+ }
+out:
+ return now;
}
-EXPORT_SYMBOL(current_time);
+EXPORT_SYMBOL(inode_set_ctime_current);
/**
- * inode_set_ctime_current - set the ctime to current_time
- * @inode: inode
+ * inode_set_ctime_deleg - try to update the ctime on a delegated inode
+ * @inode: inode to update
+ * @update: timespec64 to set the ctime
*
- * Set the inode->i_ctime to the current value for the inode. Returns
- * the current value that was assigned to i_ctime.
+ * Attempt to atomically update the ctime on behalf of a delegation holder.
+ *
+ * The nfs server can call back the holder of a delegation to get updated
+ * inode attributes, including the mtime. When updating the mtime, update
+ * the ctime to a value at least equal to that.
+ *
+ * This can race with concurrent updates to the inode, in which
+ * case the update is skipped.
+ *
+ * Note that this works even when multigrain timestamps are not enabled,
+ * so it is used in either case.
*/
-struct timespec64 inode_set_ctime_current(struct inode *inode)
+struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update)
{
- struct timespec64 now = current_time(inode);
+ struct timespec64 now, cur_ts;
+ u32 cur, old;
- inode_set_ctime_to_ts(inode, now);
- return now;
+ /* pairs with try_cmpxchg below */
+ cur = smp_load_acquire(&inode->i_ctime_nsec);
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ cur_ts.tv_sec = inode->i_ctime_sec;
+
+ /* If the update is older than the existing value, skip it. */
+ if (timespec64_compare(&update, &cur_ts) <= 0)
+ return cur_ts;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ /* Clamp the update to "now" if it's in the future */
+ if (timespec64_compare(&update, &now) > 0)
+ update = now;
+
+ update = timestamp_truncate(update, inode);
+
+ /* No need to update if the values are already the same */
+ if (timespec64_equal(&update, &cur_ts))
+ return cur_ts;
+
+ /*
+ * Try to swap the nsec value into place. If it fails, that means
+ * it raced with an update due to a write or similar activity. That
+ * stamp takes precedence, so just skip the update.
+ */
+retry:
+ old = cur;
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
+ inode->i_ctime_sec = update.tv_sec;
+ mgtime_counter_inc(mg_ctime_swaps);
+ return update;
+ }
+
+ /*
+ * Was the change due to another task marking the old ctime QUERIED?
+ *
+ * If so, then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED)))
+ goto retry;
+
+ /* Otherwise, it was a new timestamp. */
+ cur_ts.tv_sec = inode->i_ctime_sec;
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ return cur_ts;
}
-EXPORT_SYMBOL(inode_set_ctime_current);
+EXPORT_SYMBOL(inode_set_ctime_deleg);
/**
* in_group_or_capable - check whether caller is CAP_FSETID privileged
@@ -2672,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_current);
* @inode: inode to check
* @vfsgid: the new/current vfsgid of @inode
*
- * Check wether @vfsgid is in the caller's group list or if the caller is
+ * Check whether @vfsgid is in the caller's group list or if the caller is
* privileged with CAP_FSETID over @inode. This can be used to determine
* whether the setgid bit can be kept or must be dropped.
*
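
The two functions above share one trick: i_ctime_nsec doubles as a flag word, with getattr-style readers setting I_CTIME_QUERIED in the stored nanoseconds and writers swapping in a fresh value with try_cmpxchg(). A failed swap is retried exactly once, and only when the sole difference is the QUERIED bit. A minimal userspace sketch of that retry rule, using C11 atomics and a stand-in QUERIED bit (names here are illustrative, not kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define QUERIED 0x1u                    /* stand-in for I_CTIME_QUERIED */

    static _Atomic uint32_t ctime_nsec;

    /* Swap in a new nsec value; tolerate one racing reader that only
     * set QUERIED on the old value.  Nothing clears QUERIED except a
     * full stamp, so a second retry is impossible by construction. */
    static bool stamp(uint32_t old, uint32_t new_ns)
    {
        uint32_t cur = old;
    retry:
        if (atomic_compare_exchange_strong(&ctime_nsec, &cur, new_ns))
            return true;                    /* our timestamp landed */
        if (!(old & QUERIED) && cur == (old | QUERIED)) {
            old = cur;                      /* only QUERIED appeared: retry once */
            goto retry;
        }
        return false;                       /* lost to a newer timestamp */
    }

    int main(void)
    {
        atomic_store(&ctime_nsec, 100);
        atomic_fetch_or(&ctime_nsec, QUERIED);  /* simulate a reader */
        printf("stamp raced but succeeded: %d\n", stamp(100, 200));
        return 0;
    }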
diff --git a/fs/internal.h b/fs/internal.h
index 8c1b7acbbe8f..e7f02ae1e098 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,7 +246,6 @@ int open_namespace(struct ns_common *ns);
* fs/stat.c:
*/
-int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
@@ -267,7 +266,7 @@ struct xattr_name {
char name[XATTR_NAME_MAX + 1];
};
-struct xattr_ctx {
+struct kernel_xattr_ctx {
/* Value of attribute */
union {
const void __user *cvalue;
@@ -280,14 +279,15 @@ struct xattr_ctx {
unsigned int flags;
};
+ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx);
+int import_xattr_name(struct xattr_name *kname, const char __user *name);
-ssize_t do_getxattr(struct mnt_idmap *idmap,
- struct dentry *d,
- struct xattr_ctx *ctx);
-
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);
#ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6e0c954388d4..638a36be31c1 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
- struct fd src_file = fdget(srcfd);
+ CLASS(fd, src_file)(srcfd);
loff_t cloned;
int ret;
- if (!fd_file(src_file))
+ if (fd_empty(src_file))
return -EBADF;
cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
olen, 0);
@@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
- fdput(src_file);
return ret;
}
@@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
if (error == -ENOIOCTLCMD)
error = vfs_ioctl(fd_file(f), cmd, arg);
-out:
- fdput(f);
return error;
}
@@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl);
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl_compat(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
switch (cmd) {
/* FICLONE takes an int argument, so don't use compat_ptr() */
@@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
error = -ENOTTY;
break;
}
-
- out:
- fdput(f);
-
return error;
}
#endif
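
The fdget()/fdput() conversions in this file (and in several files below) lean on the scope-based cleanup machinery from <linux/cleanup.h>: CLASS(fd, f)(fd) declares f with a destructor that drops the file reference whenever f goes out of scope, so every early return is balanced automatically and the out:/goto unwinding disappears. A condensed sketch of the resulting shape (do_something() is a placeholder, not a real helper):

    SYSCALL_DEFINE1(example, unsigned int, fd)
    {
        CLASS(fd, f)(fd);           /* reference dropped on every return path */

        if (fd_empty(f))            /* replaces the old !fd_file(f) check */
            return -EBADF;

        return do_something(fd_file(f));
    }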
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ef0b68bccbb6..d42f01e0fc1c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1601,6 +1601,8 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
+ if (next->io_flags & IOMAP_F_BOUNDARY)
+ return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
return false;
@@ -1720,6 +1722,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
+ if (pos > wpc->iomap.offset)
+ wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = pos;
@@ -1731,6 +1735,8 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
+ if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ return false;
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
@@ -1784,7 +1790,7 @@ new_ioend:
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, &folio->page, len);
+ wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0706a3..b521eb15759e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ if (atomic)
+ opflags |= REQ_ATOMIC;
return opflags;
}
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
- loff_t length = iomap_length(iter);
+ const loff_t length = iomap_length(iter);
+ bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
+ if (atomic && length != fs_block_size)
+ return -EINVAL;
+
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- /*
- * Set the operation flags early so that bio_iov_iter_get_pages
- * can set up the page vector appropriately for a ZONE_APPEND
- * operation.
- */
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
+ if (WARN_ON_ONCE(atomic && n != length)) {
+ /*
+ * This bio should have covered the complete length,
+ * which it doesn't, so error. We may need to zero out
+ * the tail (complete FS block), similar to when
+ * bio_iov_iter_get_pages() returns an error, above.
+ */
+ ret = -EINVAL;
+ bio_put(bio);
+ goto zero_tail;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
- ret = -ENOTBLK;
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * folio invalidation failed, maybe
+ * this is transient, unlock and see if
+ * the caller tries again.
+ */
+ ret = -EAGAIN;
+ } else {
+ /* fall back to buffered write */
+ ret = -ENOTBLK;
+ }
}
goto out_free_dio;
}
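
On the write side this wires IOCB_ATOMIC through to REQ_ATOMIC and enforces that an atomic direct write covers exactly one filesystem block, while a failed folio invalidation now surfaces as -EAGAIN rather than falling back to buffered I/O. From userspace the path is exercised with pwritev2() and RWF_ATOMIC; a hedged sketch, assuming a 4KiB block size and a device/filesystem pair that advertises atomic-write support:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #ifndef RWF_ATOMIC
    #define RWF_ATOMIC 0x00000040   /* uapi value, kernels >= 6.11 */
    #endif

    int main(void)
    {
        int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        void *buf;
        if (posix_memalign(&buf, 4096, 4096))   /* assume 4KiB FS blocks */
            return 1;
        memset(buf, 'A', 4096);

        /* Must match the FS block size/alignment or iomap returns -EINVAL;
         * a transient -EAGAIN (failed invalidation) is worth retrying. */
        struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
        if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
            perror("pwritev2");
        close(fd);
        return 0;
    }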
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index f50311a6b429..47038e660812 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -948,8 +948,6 @@ root_found:
goto out_no_inode;
}
- kfree(opt->iocharset);
-
return 0;
/*
@@ -987,7 +985,6 @@ out_freebh:
brelse(bh);
brelse(pri_bh);
out_freesbi:
- kfree(opt->iocharset);
kfree(sbi);
s->s_fs_info = NULL;
return error;
@@ -1528,7 +1525,10 @@ static int isofs_get_tree(struct fs_context *fc)
static void isofs_free_fc(struct fs_context *fc)
{
- kfree(fc->fs_private);
+ struct isofs_options *opt = fc->fs_private;
+
+ kfree(opt->iocharset);
+ kfree(opt);
}
static const struct fs_context_operations isofs_context_ops = {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4305a1ac808a..9153ff3a08e7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
- if (escape < 0) {
- jbd2_journal_abort(journal, escape);
- continue;
- }
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97f487c3d8fc..7e49d912b091 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -318,7 +318,6 @@ static inline void jbd2_data_do_escape(char *data)
*
*
* Return value:
- * <0: Error
* =0: Finished OK without escape
* =1: Finished OK with escape
*/
@@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
goto escape_done;
spin_unlock(&jh_in->b_state_lock);
- tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
- if (!tmp) {
- brelse(new_bh);
- free_buffer_head(new_bh);
- return -ENOMEM;
- }
+ tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
@@ -1518,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal)
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
-/* First: create and setup a journal_t object in memory. We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
+/* The journal_init_common() function creates and fills a journal_t object
+ * in memory. It calls journal_load_superblock() to load the on-disk journal
+ * superblock and initialize the journal_t object.
+ */
static journal_t *journal_init_common(struct block_device *bdev,
struct block_device *fs_dev,
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 667f67342c52..9192be7c19d8 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -485,6 +485,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return tag->t_checksum == cpu_to_be16(csum32);
}
+static __always_inline int jbd2_do_replay(journal_t *journal,
+ struct recovery_info *info,
+ struct buffer_head *bh,
+ unsigned long *next_log_block,
+ unsigned int next_commit_ID)
+{
+ char *tagp;
+ int flags;
+ int ret = 0;
+ int tag_bytes = journal_tag_bytes(journal);
+ int descr_csum_size = 0;
+ unsigned long io_block;
+ journal_block_tag_t tag;
+ struct buffer_head *obh;
+ struct buffer_head *nbh;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ descr_csum_size = sizeof(struct jbd2_journal_block_tail);
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while (tagp - bh->b_data + tag_bytes <=
+ journal->j_blocksize - descr_csum_size) {
+ int err;
+
+ memcpy(&tag, tagp, sizeof(tag));
+ flags = be16_to_cpu(tag.t_flags);
+
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ /* Recover what we can, but report failure at the end. */
+ ret = err;
+ pr_err("JBD2: IO error %d recovering block %lu in log\n",
+ err, io_block);
+ } else {
+ unsigned long long blocknr;
+
+ J_ASSERT(obh != NULL);
+ blocknr = read_tag_block(journal, &tag);
+
+ /* If the block has been revoked, then we're all done here. */
+ if (jbd2_journal_test_revoke(journal, blocknr,
+ next_commit_ID)) {
+ brelse(obh);
+ ++info->nr_revoke_hits;
+ goto skip_write;
+ }
+
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(journal, &tag,
+ (journal_block_tag3_t *)tagp,
+ obh->b_data, next_commit_ID)) {
+ brelse(obh);
+ ret = -EFSBADCRC;
+ pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n",
+ blocknr, io_block);
+ goto skip_write;
+ }
+
+ /* Find a buffer for the new data being restored */
+ nbh = __getblk(journal->j_fs_dev, blocknr,
+ journal->j_blocksize);
+ if (nbh == NULL) {
+ pr_err("JBD2: Out of memory during recovery.\n");
+ brelse(obh);
+ return -ENOMEM;
+ }
+
+ lock_buffer(nbh);
+ memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
+ if (flags & JBD2_FLAG_ESCAPE) {
+ *((__be32 *)nbh->b_data) =
+ cpu_to_be32(JBD2_MAGIC_NUMBER);
+ }
+
+ BUFFER_TRACE(nbh, "marking dirty");
+ set_buffer_uptodate(nbh);
+ mark_buffer_dirty(nbh);
+ BUFFER_TRACE(nbh, "marking uptodate");
+ ++info->nr_replays;
+ unlock_buffer(nbh);
+ brelse(obh);
+ brelse(nbh);
+ }
+
+skip_write:
+ tagp += tag_bytes;
+ if (!(flags & JBD2_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JBD2_FLAG_LAST_TAG)
+ break;
+ }
+
+ return ret;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -493,13 +591,10 @@ static int do_one_pass(journal_t *journal,
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
- struct buffer_head * bh;
+ struct buffer_head *bh = NULL;
unsigned int sequence;
int blocktype;
- int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
- int descr_csum_size = 0;
- int block_error = 0;
bool need_check_commit_time = false;
__u64 last_trans_commit_time = 0, commit_time;
@@ -528,12 +623,6 @@ static int do_one_pass(journal_t *journal,
*/
while (1) {
- int flags;
- char * tagp;
- journal_block_tag_t tag;
- struct buffer_head * obh;
- struct buffer_head * nbh;
-
cond_resched();
/* If we already know where to stop the log traversal,
@@ -552,6 +641,8 @@ static int do_one_pass(journal_t *journal,
* record. */
jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ brelse(bh);
+ bh = NULL;
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -567,20 +658,16 @@ static int do_one_pass(journal_t *journal,
tmp = (journal_header_t *)bh->b_data;
- if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- brelse(bh);
+ if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER))
break;
- }
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
- if (sequence != next_commit_ID) {
- brelse(bh);
+ if (sequence != next_commit_ID)
break;
- }
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
@@ -589,11 +676,7 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (jbd2_journal_has_csum_v2or3(journal))
- descr_csum_size =
- sizeof(struct jbd2_journal_block_tail);
- if (descr_csum_size > 0 &&
- !jbd2_descriptor_block_csum_verify(journal,
+ if (!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
/*
* PASS_SCAN can see stale blocks due to lazy
@@ -603,7 +686,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
next_log_block);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
need_check_commit_time = true;
@@ -619,125 +701,39 @@ static int do_one_pass(journal_t *journal,
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal) &&
- !need_check_commit_time &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
- &crc32_sum)) {
- put_bh(bh);
+ &crc32_sum))
break;
- }
- put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
- put_bh(bh);
continue;
}
- /* A descriptor block: we can now write all of
- * the data blocks. Yay, useful work is finally
- * getting done here! */
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize - descr_csum_size) {
- unsigned long io_block;
-
- memcpy(&tag, tagp, sizeof(tag));
- flags = be16_to_cpu(tag.t_flags);
-
- io_block = next_log_block++;
- wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
- if (err) {
- /* Recover what we can, but
- * report failure at the end. */
- success = err;
- printk(KERN_ERR
- "JBD2: IO error %d recovering "
- "block %lu in log\n",
- err, io_block);
- } else {
- unsigned long long blocknr;
-
- J_ASSERT(obh != NULL);
- blocknr = read_tag_block(journal,
- &tag);
-
- /* If the block has been
- * revoked, then we're all done
- * here. */
- if (jbd2_journal_test_revoke
- (journal, blocknr,
- next_commit_ID)) {
- brelse(obh);
- ++info->nr_revoke_hits;
- goto skip_write;
- }
-
- /* Look for block corruption */
- if (!jbd2_block_tag_csum_verify(
- journal, &tag, (journal_block_tag3_t *)tagp,
- obh->b_data, be32_to_cpu(tmp->h_sequence))) {
- brelse(obh);
- success = -EFSBADCRC;
- printk(KERN_ERR "JBD2: Invalid "
- "checksum recovering "
- "data block %llu in "
- "journal block %lu\n",
- blocknr, io_block);
- block_error = 1;
- goto skip_write;
- }
-
- /* Find a buffer for the new
- * data being restored */
- nbh = __getblk(journal->j_fs_dev,
- blocknr,
- journal->j_blocksize);
- if (nbh == NULL) {
- printk(KERN_ERR
- "JBD2: Out of memory "
- "during recovery.\n");
- err = -ENOMEM;
- brelse(bh);
- brelse(obh);
- goto failed;
- }
-
- lock_buffer(nbh);
- memcpy(nbh->b_data, obh->b_data,
- journal->j_blocksize);
- if (flags & JBD2_FLAG_ESCAPE) {
- *((__be32 *)nbh->b_data) =
- cpu_to_be32(JBD2_MAGIC_NUMBER);
- }
-
- BUFFER_TRACE(nbh, "marking dirty");
- set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
- BUFFER_TRACE(nbh, "marking uptodate");
- ++info->nr_replays;
- unlock_buffer(nbh);
- brelse(obh);
- brelse(nbh);
- }
-
- skip_write:
- tagp += tag_bytes;
- if (!(flags & JBD2_FLAG_SAME_UUID))
- tagp += 16;
-
- if (flags & JBD2_FLAG_LAST_TAG)
- break;
+ /*
+ * A descriptor block: we can now write all of the
+ * data blocks. Yay, useful work is finally getting
+ * done here!
+ */
+ err = jbd2_do_replay(journal, info, bh, &next_log_block,
+ next_commit_ID);
+ if (err) {
+ if (err == -ENOMEM)
+ goto failed;
+ success = err;
}
- brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
+ if (pass != PASS_SCAN) {
+ next_commit_ID++;
+ continue;
+ }
+
/* How to differentiate between interrupted commit
* and journal corruption ?
*
@@ -782,7 +778,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum found in transaction %u\n",
next_commit_ID);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
ignore_crc_mismatch:
@@ -792,7 +787,6 @@ static int do_one_pass(journal_t *journal,
*/
jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
- brelse(bh);
goto done;
}
@@ -802,8 +796,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number.
*/
- if (pass == PASS_SCAN &&
- jbd2_has_feature_checksum(journal)) {
+ if (jbd2_has_feature_checksum(journal)) {
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
@@ -812,7 +805,6 @@ static int do_one_pass(journal_t *journal,
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
- brelse(bh);
break;
}
@@ -828,36 +820,33 @@ static int do_one_pass(journal_t *journal,
goto chksum_error;
crc32_sum = ~0;
+ goto chksum_ok;
}
- if (pass == PASS_SCAN &&
- !jbd2_commit_block_csum_verify(journal,
- bh->b_data)) {
- if (jbd2_commit_block_csum_verify_partial(
- journal,
- bh->b_data)) {
- pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
- next_commit_ID, next_log_block);
- goto chksum_ok;
- }
- chksum_error:
- if (commit_time < last_trans_commit_time)
- goto ignore_crc_mismatch;
- info->end_transaction = next_commit_ID;
- info->head_block = head_block;
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
+ if (jbd2_commit_block_csum_verify(journal, bh->b_data))
+ goto chksum_ok;
+
+ if (jbd2_commit_block_csum_verify_partial(journal,
+ bh->b_data)) {
+ pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+ next_commit_ID, next_log_block);
+ goto chksum_ok;
}
- if (pass == PASS_SCAN) {
- chksum_ok:
- last_trans_commit_time = commit_time;
- head_block = next_log_block;
+
+chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
+ info->end_transaction = next_commit_ID;
+ info->head_block = head_block;
+
+ if (!jbd2_has_feature_async_commit(journal)) {
+ journal->j_failed_commit = next_commit_ID;
+ break;
}
- brelse(bh);
+
+chksum_ok:
+ last_trans_commit_time = commit_time;
+ head_block = next_log_block;
next_commit_ID++;
continue;
@@ -876,14 +865,11 @@ static int do_one_pass(journal_t *journal,
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
- if (pass != PASS_REVOKE) {
- brelse(bh);
+ if (pass != PASS_REVOKE)
continue;
- }
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
- brelse(bh);
if (err)
goto failed;
continue;
@@ -891,12 +877,12 @@ static int do_one_pass(journal_t *journal,
default:
jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
- brelse(bh);
goto done;
}
}
done:
+ brelse(bh);
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
@@ -927,11 +913,10 @@ static int do_one_pass(journal_t *journal,
success = err;
}
- if (block_error && success == 0)
- success = -EIO;
return success;
failed:
+ brelse(bh);
return err;
}
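
The replay loop that moves into jbd2_do_replay() is easier to follow against the on-disk layout it walks: a descriptor block is a journal header followed by a packed run of fixed-size tags, each optionally trailed by a 16-byte UUID, terminated by JBD2_FLAG_LAST_TAG or by running into the checksum tail. Roughly:

    /* Descriptor block layout walked by jbd2_do_replay() (simplified;
     * tag size comes from journal_tag_bytes(), csum tail only with v2/v3):
     *
     *   +--------------------+
     *   | journal_header_t   |
     *   +--------------------+
     *   | tag 0              |  t_flags: SAME_UUID? ESCAPE? LAST_TAG?
     *   | [16-byte UUID]     |  present only if !JBD2_FLAG_SAME_UUID
     *   | tag 1              |
     *   | ...                |
     *   +--------------------+
     *   | [csum tail]        |  struct jbd2_journal_block_tail
     *   +--------------------+
     */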
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 3ab410059dc2..f9009e4f9ffd 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1820,6 +1820,9 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
return -EIO;
dp = (struct dmap *) mp->data;
+ if (dp->tree.budmin < 0)
+ return -EIO;
+
/* try to allocate the blocks.
*/
rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
@@ -2888,6 +2891,9 @@ static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
/* bubble the new value up the tree as required.
*/
for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
+ if (lp == 0)
+ break;
+
/* get the index of the first leaf of the 4 leaf
* group containing the specified leaf (leafno).
*/
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 5d3127ca68a4..8f85177f284b 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2891,6 +2891,14 @@ int jfs_readdir(struct file *file, struct dir_context *ctx)
stbl = DT_GETSTBL(p);
for (i = index; i < p->header.nextindex; i++) {
+ if (stbl[i] < 0 || stbl[i] > 127) {
+ jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld",
+ i, stbl[i], (long)ip->i_ino, (long long)bn);
+ free_page(dirent_buf);
+ DT_PUTPAGE(mp);
+ return -EIO;
+ }
+
d = (struct ldtentry *) & p->slot[stbl[i]];
if (((long) jfs_dirent + d->namlen + 1) >
@@ -3086,6 +3094,13 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
/* get the leftmost entry */
stbl = DT_GETSTBL(p);
+
+ if (stbl[0] < 0 || stbl[0] > 127) {
+ DT_PUTPAGE(mp);
+ jfs_error(ip->i_sb, "stbl[0] out of bound\n");
+ return -EIO;
+ }
+
xd = (pxd_t *) & p->slot[stbl[0]];
/* get the child page block address */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 33ef13a0b110..8794281f8ffd 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -24,6 +24,7 @@
#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
#define JFS_ERR_CONTINUE 0x00000004 /* continue */
#define JFS_ERR_PANIC 0x00000008 /* panic */
+#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC)
/* Quota support */
#define JFS_USRQUOTA 0x00000010
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e1be21ca5d6e..223d9ac59839 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -6,11 +6,11 @@
#include <linux/fs.h>
#include <linux/module.h>
-#include <linux/parser.h>
#include <linux/completion.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/posix_acl.h>
@@ -210,240 +210,195 @@ enum {
Opt_discard, Opt_nodiscard, Opt_discard_minblk
};
-static const match_table_t tokens = {
- {Opt_integrity, "integrity"},
- {Opt_nointegrity, "nointegrity"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_resize, "resize=%u"},
- {Opt_resize_nosize, "resize"},
- {Opt_errors, "errors=%s"},
- {Opt_ignore, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%u"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_discard_minblk, "discard=%u"},
- {Opt_err, NULL}
+static const struct constant_table jfs_param_errors[] = {
+ {"continue", JFS_ERR_CONTINUE},
+ {"remount-ro", JFS_ERR_REMOUNT_RO},
+ {"panic", JFS_ERR_PANIC},
+ {}
};
-static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
- int *flag)
-{
- void *nls_map = (void *)-1; /* -1: no change; NULL: none */
- char *p;
- struct jfs_sb_info *sbi = JFS_SBI(sb);
+static const struct fs_parameter_spec jfs_param_spec[] = {
+ fsparam_flag_no ("integrity", Opt_integrity),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_u64 ("resize", Opt_resize),
+ fsparam_flag ("resize", Opt_resize_nosize),
+ fsparam_enum ("errors", Opt_errors, jfs_param_errors),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_u32 ("discard", Opt_discard_minblk),
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ {}
+};
- *newLVSize = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_integrity:
- *flag &= ~JFS_NOINTEGRITY;
- break;
- case Opt_nointegrity:
- *flag |= JFS_NOINTEGRITY;
- break;
- case Opt_ignore:
- /* Silently ignore the quota options */
- /* Don't do anything ;-) */
- break;
- case Opt_iocharset:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
- if (!strcmp(args[0].from, "none"))
- nls_map = NULL;
- else {
- nls_map = load_nls(args[0].from);
- if (!nls_map) {
- pr_err("JFS: charset not found\n");
- goto cleanup;
- }
- }
- break;
- case Opt_resize:
- {
- char *resize = args[0].from;
- int rc = kstrtoll(resize, 0, newLVSize);
+struct jfs_context {
+ int flag;
+ kuid_t uid;
+ kgid_t gid;
+ uint umask;
+ uint minblks_trim;
+ void *nls_map;
+ bool resize;
+ s64 newLVSize;
+};
- if (rc)
- goto cleanup;
- break;
- }
- case Opt_resize_nosize:
- {
- *newLVSize = sb_bdev_nr_blocks(sb);
- if (*newLVSize == 0)
- pr_err("JFS: Cannot determine volume size\n");
- break;
+static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct jfs_context *ctx = fc->fs_private;
+ int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE);
+ struct fs_parse_result result;
+ struct nls_table *nls_map;
+ int opt;
+
+ opt = fs_parse(fc, jfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_integrity:
+ if (result.negated)
+ ctx->flag |= JFS_NOINTEGRITY;
+ else
+ ctx->flag &= ~JFS_NOINTEGRITY;
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ /* Don't do anything ;-) */
+ break;
+ case Opt_iocharset:
+ if (ctx->nls_map && ctx->nls_map != (void *) -1) {
+ unload_nls(ctx->nls_map);
+ ctx->nls_map = NULL;
}
- case Opt_errors:
- {
- char *errors = args[0].from;
- if (!errors || !*errors)
- goto cleanup;
- if (!strcmp(errors, "continue")) {
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_CONTINUE;
- } else if (!strcmp(errors, "remount-ro")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_REMOUNT_RO;
- } else if (!strcmp(errors, "panic")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag |= JFS_ERR_PANIC;
- } else {
- pr_err("JFS: %s is an invalid error handler\n",
- errors);
- goto cleanup;
+ if (!strcmp(param->string, "none"))
+ ctx->nls_map = NULL;
+ else {
+ nls_map = load_nls(param->string);
+ if (!nls_map) {
+ pr_err("JFS: charset not found\n");
+ return -EINVAL;
}
- break;
+ ctx->nls_map = nls_map;
}
+ break;
+ case Opt_resize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ ctx->newLVSize = result.uint_64;
+ break;
+ case Opt_resize_nosize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ break;
+ case Opt_errors:
+ ctx->flag &= ~JFS_ERR_MASK;
+ ctx->flag |= result.uint_32;
+ break;
#ifdef CONFIG_QUOTA
- case Opt_quota:
- case Opt_usrquota:
- *flag |= JFS_USRQUOTA;
- break;
- case Opt_grpquota:
- *flag |= JFS_GRPQUOTA;
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ ctx->flag |= JFS_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ ctx->flag |= JFS_GRPQUOTA;
+ break;
#else
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_quota:
- pr_err("JFS: quota operations not supported\n");
- break;
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_quota:
+ pr_err("JFS: quota operations not supported\n");
+ break;
#endif
- case Opt_uid:
- {
- char *uid = args[0].from;
- uid_t val;
- int rc = kstrtouint(uid, 0, &val);
-
- if (rc)
- goto cleanup;
- sbi->uid = make_kuid(current_user_ns(), val);
- if (!uid_valid(sbi->uid))
- goto cleanup;
- break;
- }
-
- case Opt_gid:
- {
- char *gid = args[0].from;
- gid_t val;
- int rc = kstrtouint(gid, 0, &val);
-
- if (rc)
- goto cleanup;
- sbi->gid = make_kgid(current_user_ns(), val);
- if (!gid_valid(sbi->gid))
- goto cleanup;
- break;
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
+
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+
+ case Opt_umask:
+ if (result.uint_32 & ~0777) {
+ pr_err("JFS: Invalid value of umask\n");
+ return -EINVAL;
}
+ ctx->umask = result.uint_32;
+ break;
- case Opt_umask:
- {
- char *umask = args[0].from;
- int rc = kstrtouint(umask, 8, &sbi->umask);
+ case Opt_discard:
+ /* if set to 1, even copying files will cause
+ * trimming :O
+ * -> user has more control over the online trimming
+ */
+ ctx->minblks_trim = 64;
+ ctx->flag |= JFS_DISCARD;
+ break;
- if (rc)
- goto cleanup;
- if (sbi->umask & ~0777) {
- pr_err("JFS: Invalid value of umask\n");
- goto cleanup;
- }
- break;
- }
+ case Opt_nodiscard:
+ ctx->flag &= ~JFS_DISCARD;
+ break;
- case Opt_discard:
- /* if set to 1, even copying files will cause
- * trimming :O
- * -> user has more control over the online trimming
- */
- sbi->minblks_trim = 64;
- if (bdev_max_discard_sectors(sb->s_bdev))
- *flag |= JFS_DISCARD;
- else
- pr_err("JFS: discard option not supported on device\n");
- break;
-
- case Opt_nodiscard:
- *flag &= ~JFS_DISCARD;
- break;
-
- case Opt_discard_minblk:
- {
- char *minblks_trim = args[0].from;
- int rc;
- if (bdev_max_discard_sectors(sb->s_bdev)) {
- *flag |= JFS_DISCARD;
- rc = kstrtouint(minblks_trim, 0,
- &sbi->minblks_trim);
- if (rc)
- goto cleanup;
- } else
- pr_err("JFS: discard option not supported on device\n");
- break;
- }
+ case Opt_discard_minblk:
+ ctx->minblks_trim = result.uint_32;
+ ctx->flag |= JFS_DISCARD;
+ break;
- default:
- printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
- p);
- goto cleanup;
- }
- }
-
- if (nls_map != (void *) -1) {
- /* Discard old (if remount) */
- unload_nls(sbi->nls_tab);
- sbi->nls_tab = nls_map;
+ default:
+ return -EINVAL;
}
- return 1;
-cleanup:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
return 0;
}
-static int jfs_remount(struct super_block *sb, int *flags, char *data)
+static int jfs_reconfigure(struct fs_context *fc)
{
- s64 newLVSize = 0;
+ struct jfs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+ int readonly = fc->sb_flags & SB_RDONLY;
int rc = 0;
- int flag = JFS_SBI(sb)->flag;
+ int flag = ctx->flag;
int ret;
sync_filesystem(sb);
- if (!parse_options(data, sb, &newLVSize, &flag))
- return -EINVAL;
- if (newLVSize) {
+ /* Transfer results of parsing to the sbi */
+ JFS_SBI(sb)->flag = ctx->flag;
+ JFS_SBI(sb)->uid = ctx->uid;
+ JFS_SBI(sb)->gid = ctx->gid;
+ JFS_SBI(sb)->umask = ctx->umask;
+ JFS_SBI(sb)->minblks_trim = ctx->minblks_trim;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(JFS_SBI(sb)->nls_tab);
+ JFS_SBI(sb)->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
+
+ if (ctx->resize) {
if (sb_rdonly(sb)) {
pr_err("JFS: resize requires volume to be mounted read-write\n");
return -EROFS;
}
- rc = jfs_extendfs(sb, newLVSize, 0);
+
+ if (!ctx->newLVSize) {
+ ctx->newLVSize = sb_bdev_nr_blocks(sb);
+ if (ctx->newLVSize == 0)
+ pr_err("JFS: Cannot determine volume size\n");
+ }
+
+ rc = jfs_extendfs(sb, ctx->newLVSize, 0);
if (rc)
return rc;
}
- if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
+ if (sb_rdonly(sb) && !readonly) {
/*
* Invalidate any previously read metadata. fsck may have
* changed the on-disk data since we mounted r/o
@@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
dquot_resume(sb, -1);
return ret;
}
- if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
+ if (!sb_rdonly(sb) && readonly) {
rc = dquot_suspend(sb, -1);
if (rc < 0)
return rc;
@@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
JFS_SBI(sb)->flag = flag;
return rc;
}
- if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
+ if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) {
if (!sb_rdonly(sb)) {
rc = jfs_umount_rw(sb);
if (rc)
@@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
ret = jfs_mount_rw(sb, 1);
return ret;
}
+ }
JFS_SBI(sb)->flag = flag;
return 0;
}
-static int jfs_fill_super(struct super_block *sb, void *data, int silent)
+static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct jfs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct jfs_sb_info *sbi;
struct inode *inode;
int rc;
- s64 newLVSize = 0;
- int flag, ret = -EINVAL;
+ int ret = -EINVAL;
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
@@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
sbi->sb = sb;
- sbi->uid = INVALID_UID;
- sbi->gid = INVALID_GID;
- sbi->umask = -1;
-
- /* initialize the mount flag and determine the default error handler */
- flag = JFS_ERR_REMOUNT_RO;
- if (!parse_options((char *) data, sb, &newLVSize, &flag))
- goto out_kfree;
- sbi->flag = flag;
+ /* Transfer results of parsing to the sbi */
+ sbi->flag = ctx->flag;
+ sbi->uid = ctx->uid;
+ sbi->gid = ctx->gid;
+ sbi->umask = ctx->umask;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(sbi->nls_tab);
+ sbi->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
+
+ if (sbi->flag & JFS_DISCARD) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
+ pr_err("JFS: discard option not supported on device\n");
+ sbi->flag &= ~JFS_DISCARD;
+ } else {
+ sbi->minblks_trim = ctx->minblks_trim;
+ }
+ }
#ifdef CONFIG_JFS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
- if (newLVSize) {
+ if (ctx->resize) {
pr_err("resize option for remount only\n");
- goto out_kfree;
+ goto out_unload;
}
/*
@@ -608,7 +575,6 @@ out_mount_failed:
sbi->direct_inode = NULL;
out_unload:
unload_nls(sbi->nls_tab);
-out_kfree:
kfree(sbi);
return ret;
}
@@ -664,10 +630,9 @@ out:
return rc;
}
-static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int jfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+ return get_tree_bdev(fc, jfs_fill_super);
}
static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = {
.freeze_fs = jfs_freeze,
.unfreeze_fs = jfs_unfreeze,
.statfs = jfs_statfs,
- .remount_fs = jfs_remount,
.show_options = jfs_show_options,
#ifdef CONFIG_QUOTA
.quota_read = jfs_quota_read,
@@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = {
.get_parent = jfs_get_parent,
};
+static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx)
+{
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+
+ /* Copy over current option values and mount flags */
+ ctx->uid = JFS_SBI(sb)->uid;
+ ctx->gid = JFS_SBI(sb)->gid;
+ ctx->umask = JFS_SBI(sb)->umask;
+ ctx->nls_map = (void *)-1;
+ ctx->minblks_trim = JFS_SBI(sb)->minblks_trim;
+ ctx->flag = JFS_SBI(sb)->flag;
+
+ } else {
+ /*
+ * Initialize the mount flag and determine the default
+ * error handler
+ */
+ ctx->flag = JFS_ERR_REMOUNT_RO;
+ ctx->uid = INVALID_UID;
+ ctx->gid = INVALID_GID;
+ ctx->umask = -1;
+ ctx->nls_map = (void *)-1;
+ }
+}
+
+static void jfs_free_fc(struct fs_context *fc)
+{
+ struct jfs_context *ctx = fc->fs_private;
+
+ if (ctx->nls_map != (void *) -1)
+ unload_nls(ctx->nls_map);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations jfs_context_ops = {
+ .parse_param = jfs_parse_param,
+ .get_tree = jfs_get_tree,
+ .reconfigure = jfs_reconfigure,
+ .free = jfs_free_fc,
+};
+
+static int jfs_init_fs_context(struct fs_context *fc)
+{
+ struct jfs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ jfs_init_options(fc, ctx);
+
+ fc->fs_private = ctx;
+ fc->ops = &jfs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type jfs_fs_type = {
.owner = THIS_MODULE,
.name = "jfs",
- .mount = jfs_do_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = jfs_init_fs_context,
+ .parameters = jfs_param_spec,
};
MODULE_ALIAS_FS("jfs");
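
With jfs on the new mount API, jfs_param_spec is also the table that fsconfig(2) validates against, so the same options can be driven from userspace through the fsopen()/fsconfig()/fsmount() sequence. A hedged sketch (glibc >= 2.36 provides the wrappers; the device path and mountpoint are illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>          /* AT_FDCWD */
    #include <stdio.h>
    #include <sys/mount.h>      /* fsopen/fsconfig/fsmount, glibc >= 2.36 */
    #include <unistd.h>

    int main(void)
    {
        int fsfd = fsopen("jfs", FSOPEN_CLOEXEC);
        if (fsfd < 0) { perror("fsopen"); return 1; }

        /* Each key is validated against jfs_param_spec above. */
        fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sdb1", 0);
        fsconfig(fsfd, FSCONFIG_SET_STRING, "errors", "remount-ro", 0);
        fsconfig(fsfd, FSCONFIG_SET_FLAG, "discard", NULL, 0);
        if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
            perror("fsconfig");
            return 1;
        }

        int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
        if (mfd < 0 || move_mount(mfd, "", AT_FDCWD, "/mnt",
                                  MOVE_MOUNT_F_EMPTY_PATH) < 0)
            perror("mount");
        return 0;
    }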
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 0fb05e314edf..24afbae87225 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -559,7 +559,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
- int size = min_t(int, EALIST_SIZE(ea_buf->xattr), ea_size);
+ int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
printk(KERN_ERR "ea_get: invalid extended attribute\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
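
The min_t() to clamp_t() switch above matters because a corrupted EA list can make ea_size negative, and min_t(int, EALIST_SIZE(...), ea_size) would hand that negative value to print_hex_dump() as a length; clamping to [0, EALIST_SIZE] pins both underflow and overflow. A tiny worked demonstration with userspace re-definitions of the two kernel macros:

    #include <stdio.h>

    #define min_t(t, a, b)  ((t)(a) < (t)(b) ? (t)(a) : (t)(b))
    #define clamp_t(t, v, lo, hi) \
        ((t)(v) < (t)(lo) ? (t)(lo) : ((t)(v) > (t)(hi) ? (t)(hi) : (t)(v)))

    int main(void)
    {
        int ealist_size = 64;   /* EALIST_SIZE(ea_buf->xattr) */
        int ea_size = -1;       /* corrupted on-disk size */

        printf("min_t:   %d\n", min_t(int, ealist_size, ea_size));      /* -1 */
        printf("clamp_t: %d\n", clamp_t(int, ea_size, 0, ealist_size)); /*  0 */
        return 0;
    }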
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 9ff37ae650ea..de32c95d823d 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -175,15 +175,11 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
size_t buf_size, size_t *file_size,
enum kernel_read_file_id id)
{
- struct fd f = fdget(fd);
- ssize_t ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
- goto out;
+ if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ return -EBADF;
- ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
-out:
- fdput(f);
- return ret;
+ return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
diff --git a/fs/libfs.c b/fs/libfs.c
index 46966fd8bcf9..748ac5923154 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
return ERR_PTR(-ENAMETOOLONG);
if (!dentry->d_sb->s_d_op)
d_set_d_op(dentry, &simple_dentry_operations);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ return NULL;
+
d_add(dentry, NULL);
return NULL;
}
@@ -1711,15 +1715,6 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENOENT);
}
-static int empty_dir_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- return 0;
-}
-
static int empty_dir_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr)
{
@@ -1733,9 +1728,7 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
static const struct inode_operations empty_dir_inode_operations = {
.lookup = empty_dir_lookup,
- .permission = generic_permission,
.setattr = empty_dir_setattr,
- .getattr = empty_dir_getattr,
.listxattr = empty_dir_listxattr,
};
@@ -1791,8 +1784,8 @@ bool is_empty_dir_inode(struct inode *inode)
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
-static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
{
const struct dentry *parent;
const struct inode *dir;
@@ -1835,6 +1828,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
+EXPORT_SYMBOL(generic_ci_d_compare);
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1843,7 +1837,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
-static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
@@ -1858,6 +1852,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
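
Two things happen in libfs.c: simple_lookup() stops creating negative dentries in casefolded directories (their validity would depend on the encoding policy), and generic_ci_d_compare()/generic_ci_d_hash() become exported symbols. The export means a filesystem that builds its own dentry_operations can reference them directly; a minimal sketch:

    /* Sketch: wiring the newly exported helpers into a filesystem's own
     * dentry_operations (the myfs_* name is illustrative). */
    static const struct dentry_operations myfs_ci_dentry_ops = {
        .d_hash    = generic_ci_d_hash,
        .d_compare = generic_ci_d_compare,
    };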
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1f2149db10f2..2359347c9fbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
- struct inode *inode = nlmsvc_file_inode(file);
+ struct inode *inode __maybe_unused = nlmsvc_file_inode(file);
struct nlm_block *block = NULL;
int error;
int mode;
@@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
+ if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) {
async_block = wait;
wait = 0;
}
@@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
* requests on the underlaying ->lock() implementation but
* only one nlm_block to being granted by lm_grant().
*/
- if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) &&
!list_empty(&block->b_list)) {
spin_unlock(&nlm_blocked_lock);
ret = nlm_lck_blocked;
diff --git a/fs/locks.c b/fs/locks.c
index 204847628f3e..25afc8d9c9d1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2136,7 +2136,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
int can_sleep, error, type;
struct file_lock fl;
- struct fd f;
/*
* LOCK_MAND locks were broken for a long time in that they never
@@ -2155,19 +2154,18 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (type < 0)
return type;
- error = -EBADF;
- f = fdget(fd);
- if (!fd_file(f))
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
- goto out_putf;
+ return -EBADF;
flock_make_lock(fd_file(f), &fl, type);
error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
- goto out_putf;
+ return error;
can_sleep = !(cmd & LOCK_NB);
if (can_sleep)
@@ -2181,9 +2179,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
- out_putf:
- fdput(f);
-
return error;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index b5b5ddf9d513..82aecf372743 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
- wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
+ wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
bio = mpage_bio_submit_write(bio);
diff --git a/fs/namei.c b/fs/namei.c
index 4a4a22a08ac2..9d30c7aa9aa6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags)
return result;
}
-struct filename *
-getname_uflags(const char __user *filename, int uflags)
+struct filename *getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
return getname_flags(filename, flags);
}
-struct filename *
-getname(const char __user * filename)
+struct filename *getname(const char __user * filename)
{
return getname_flags(filename, 0);
}
-struct filename *
-getname_kernel(const char * filename)
+struct filename *__getname_maybe_null(const char __user *pathname)
+{
+ struct filename *name;
+ char c;
+
+ /* try to save on allocations; loss on um, though */
+ if (get_user(c, pathname))
+ return ERR_PTR(-EFAULT);
+ if (!c)
+ return NULL;
+
+ name = getname_flags(pathname, LOOKUP_EMPTY);
+ if (!IS_ERR(name) && !(name->name[0])) {
+ putname(name);
+ name = NULL;
+ }
+ return name;
+}
+
+struct filename *getname_kernel(const char * filename)
{
struct filename *result;
int len = strlen(filename) + 1;
@@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
- if (IS_ERR(name))
+ if (IS_ERR_OR_NULL(name))
return;
if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
@@ -326,6 +342,25 @@ static int check_acl(struct mnt_idmap *idmap,
return -EAGAIN;
}
+/*
+ * Very quick optimistic "we know we have no ACL's" check.
+ *
+ * Note that this is purely for ACL_TYPE_ACCESS, and purely
+ * for the "we have cached that there are no ACLs" case.
+ *
+ * If this returns true, we know there are no ACLs. But if
+ * it returns false, we might still not have ACLs (it could
+ * be the is_uncached_acl() case).
+ */
+static inline bool no_acl_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+ return likely(!READ_ONCE(inode->i_acl));
+#else
+ return true;
+#endif
+}
+
/**
* acl_permission_check - perform basic UNIX permission checking
* @idmap: idmap of the mount the inode was found from
@@ -348,6 +383,28 @@ static int acl_permission_check(struct mnt_idmap *idmap,
unsigned int mode = inode->i_mode;
vfsuid_t vfsuid;
+ /*
+ * Common cheap case: everybody has the requested
+ * rights, and there are no ACLs to check. No need
+ * to do any owner/group checks in that case.
+ *
+ * - 'mask&7' is the requested permission bit set
+ * - multiplying by 0111 spreads them out to all of ugo
+ * - '& ~mode' looks for missing inode permission bits
+ * - the '!' is for "no missing permissions"
+ *
+ * After that, we just need to check that there are no
+ * ACL's on the inode - do the 'IS_POSIXACL()' check last
+ * because it will dereference the ->i_sb pointer and we
+ * want to avoid that if at all possible.
+ */
+ if (!((mask & 7) * 0111 & ~mode)) {
+ if (no_acl_inode(inode))
+ return 0;
+ if (!IS_POSIXACL(inode))
+ return 0;
+ }
+
/* Are we the owner? If so, ACL's don't matter */
vfsuid = i_uid_into_vfsuid(idmap, inode);
if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
@@ -588,6 +645,7 @@ struct nameidata {
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
+ const char *pathname;
struct nameidata *saved;
unsigned root_seq;
int dfd;
@@ -606,6 +664,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->depth = 0;
p->dfd = dfd;
p->name = name;
+ p->pathname = likely(name) ? name->name : "";
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
@@ -2439,7 +2498,7 @@ OK:
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
- const char *s = nd->name->name;
+ const char *s = nd->pathname;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
@@ -2503,26 +2562,22 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
} else {
/* Caller must check execute permissions on the starting path component */
- struct fd f = fdget_raw(nd->dfd);
+ CLASS(fd_raw, f)(nd->dfd);
struct dentry *dentry;
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
if (fd_file(f)->f_cred != current_cred() &&
- !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
- fdput(f);
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
return ERR_PTR(-ENOENT);
- }
}
dentry = fd_file(f)->f_path.dentry;
- if (*s && unlikely(!d_can_lookup(dentry))) {
- fdput(f);
+ if (*s && unlikely(!d_can_lookup(dentry)))
return ERR_PTR(-ENOTDIR);
- }
nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
@@ -2532,7 +2587,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
- fdput(f);
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
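
The fast path added to acl_permission_check() is worth unpacking: (mask & 7) keeps the rwx bits of the request, multiplying by 0111 replicates them into the user, group, and other positions, and & ~mode keeps only the positions where the mode lacks the bit, so a zero result means everybody already has the requested access. A standalone worked example:

    #include <stdio.h>

    #define MAY_EXEC  0x1
    #define MAY_WRITE 0x2
    #define MAY_READ  0x4

    int main(void)
    {
        unsigned int mode = 0644;   /* rw-r--r-- */
        unsigned int mask = MAY_READ;

        /* (mask & 7) = 04; 04 * 0111 = 0444: the read bit replicated
         * into user/group/other.  '& ~mode' keeps positions mode lacks. */
        unsigned int missing = (mask & 7) * 0111 & ~mode;
        printf("read:  missing = %03o -> %s\n", missing,
               missing ? "slow path" : "fast path, everyone may read");

        mask = MAY_WRITE;           /* 02 * 0111 = 0222; 0644 lacks 0022 */
        printf("write: missing = %03o -> slow path\n",
               (mask & 7) * 0111 & ~mode);
        return 0;
    }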
diff --git a/fs/namespace.c b/fs/namespace.c
index d26f5e6d2ca3..6b0a17487d0f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
@@ -4107,7 +4107,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
struct file *file;
struct path newmount;
struct mount *mnt;
- struct fd f;
unsigned int mnt_flags = 0;
long ret;
@@ -4135,19 +4134,18 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
return -EINVAL;
}
- f = fdget(fs_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fs_fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto err_fsfd;
+ return -EINVAL;
fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
- goto err_fsfd;
+ return ret;
/* There must be a valid superblock or we can't mount it */
ret = -EINVAL;
@@ -4214,8 +4212,6 @@ err_path:
path_put(&newmount);
err_unlock:
mutex_unlock(&fc->uapi_mutex);
-err_fsfd:
- fdput(f);
return ret;
}
@@ -4670,10 +4666,8 @@ out:
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
- int err = 0;
struct ns_common *ns;
struct user_namespace *mnt_userns;
- struct fd f;
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
@@ -4689,20 +4683,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (attr->userns_fd > INT_MAX)
return -EINVAL;
- f = fdget(attr->userns_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(attr->userns_fd);
+ if (fd_empty(f))
return -EBADF;
- if (!proc_ns_file(fd_file(f))) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (!proc_ns_file(fd_file(f)))
+ return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (ns->ops->type != CLONE_NEWUSER)
+ return -EINVAL;
/*
* The initial idmapping cannot be used to create an idmapped
@@ -4713,22 +4703,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
* result.
*/
mnt_userns = container_of(ns, struct user_namespace, ns);
- if (mnt_userns == &init_user_ns) {
- err = -EPERM;
- goto out_fput;
- }
+ if (mnt_userns == &init_user_ns)
+ return -EPERM;
/* We're not controlling the target namespace. */
- if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out_fput;
- }
+ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
+ return -EPERM;
kattr->mnt_userns = get_user_ns(mnt_userns);
-
-out_fput:
- fdput(f);
- return err;
+ return 0;
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
@@ -5006,6 +4989,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ if (sb->s_subtype)
+ seq_puts(seq, sb->s_subtype);
+}
+
+static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+ struct mount *r = real_mount(s->mnt);
+
+ if (sb->s_op->show_devname) {
+ size_t start = seq->count;
+ int ret;
+
+ ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
+ if (ret)
+ return ret;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ /* Unescape the result */
+ seq->buf[seq->count] = '\0';
+ seq->count = start;
+ seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+ } else if (r->mnt_devname) {
+ seq_puts(seq, r->mnt_devname);
+ }
+ return 0;
+}
+
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5040,35 +5057,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start)
+{
+ char *buf_end, *opt_start, *opt_end;
+ int count = 0;
+
+ buf_end = seq->buf + seq->count;
+ *buf_end = '\0';
+ for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+ opt_end = strchrnul(opt_start, ',');
+ *opt_end = '\0';
+ buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+ if (WARN_ON_ONCE(++count == INT_MAX))
+ return -EOVERFLOW;
+ }
+ seq->count = buf_start - 1 - seq->buf;
+ return count;
+}
+
+static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ if (!sb->s_op->show_options)
+ return 0;
+
+ buf_start = seq->buf + start;
+ err = sb->s_op->show_options(seq, mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_num = err;
+ return 0;
+}
+
+static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ buf_start = seq->buf + start;
+
+ err = security_sb_show_options(seq, sb);
+	if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_sec_num = err;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
- int ret;
+ int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
+ u32 start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = seq->count;
+ sm->fs_type = start;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = seq->count;
+ sm->mnt_root = start;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = seq->count;
+ sm->mnt_point = start;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = seq->count;
+ sm->mnt_opts = start;
ret = statmount_mnt_opts(s, seq);
break;
+ case STATMOUNT_OPT_ARRAY:
+ sm->opt_array = start;
+ ret = statmount_opt_array(s, seq);
+ break;
+ case STATMOUNT_OPT_SEC_ARRAY:
+ sm->opt_sec_array = start;
+ ret = statmount_opt_sec_array(s, seq);
+ break;
+ case STATMOUNT_FS_SUBTYPE:
+ sm->fs_subtype = start;
+ statmount_fs_subtype(s, seq);
+ break;
+ case STATMOUNT_SB_SOURCE:
+ sm->sb_source = start;
+ ret = statmount_sb_source(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
}
+ /*
+ * If nothing was emitted, return to avoid setting the flag
+ * and terminating the buffer.
+ */
+ if (seq->count == start)
+ return ret;
if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
return -EOVERFLOW;
if (kbufsize >= s->bufsize)
@@ -5203,6 +5319,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_MNT_OPTS)
err = statmount_string(s, STATMOUNT_MNT_OPTS);
+ if (!err && s->mask & STATMOUNT_OPT_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
+ err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
+
+ if (!err && s->mask & STATMOUNT_SB_SOURCE)
+ err = statmount_string(s, STATMOUNT_SB_SOURCE);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
@@ -5224,7 +5352,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
}
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
- STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index af46a598f4d7..7ac34550c403 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -627,7 +627,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
if (unlikely(always_fill)) {
if (pos - offset + len <= i_size)
return false; /* Page entirely before EOF */
- zero_user_segment(&folio->page, 0, plen);
+ folio_zero_segment(folio, 0, plen);
folio_mark_uptodate(folio);
return true;
}
@@ -646,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
return false;
zero_out:
- zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ folio_zero_segments(folio, 0, offset, offset + len, plen);
return true;
}
@@ -713,7 +713,7 @@ retry:
if (folio_test_uptodate(folio))
goto have_folio;
- /* If the page is beyond the EOF, we want to clear it - unless it's
+ /* If the folio is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
@@ -773,7 +773,7 @@ error:
EXPORT_SYMBOL(netfs_write_begin);
/*
- * Preload the data into a page we're proposing to write into.
+ * Preload the data into a folio we're proposing to write into.
*/
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len)
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b3910dfcb56d..b4826360a411 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
* @iter: The source buffer
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
- * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * Copy data into pagecache folios attached to the inode specified by @iocb.
* The caller must hold appropriate inode locks.
*
- * Dirty pages are tagged with a netfs_folio struct if they're not up to date
- * to indicate the range modified. Dirty pages may also be tagged with a
+ * Dirty folios are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty folios may also be tagged with a
* netfs-specific grouping such that data from an old group gets flushed before
* a new one is started.
*/
@@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
* we try to read it.
*/
if (fpos >= ctx->zero_point) {
- zero_user_segment(&folio->page, 0, offset);
+ folio_zero_segment(folio, 0, offset);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
- zero_user_segment(&folio->page, offset + copied, flen);
+ folio_zero_segment(folio, offset + copied, flen);
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
trace_netfs_folio(folio, netfs_modify_and_clear);
@@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write);
* netfs_buffered_write_iter_locked - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter);
/*
* Notification that a previously read-only page is about to become writable.
- * Note that the caller indicates a single page of a multipage folio.
+ * The caller indicates the precise page that needs to be written to, but
+ * we only track the group on a per-folio basis, so we block more often than
+ * we might otherwise.
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
@@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
- vm_fault_t ret = VM_FAULT_RETRY;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
int err;
_enter("%lx", folio->index);
@@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
if (folio_lock_killable(folio) < 0)
goto out;
- if (folio->mapping != mapping) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- if (folio_wait_writeback_killable(folio)) {
- ret = VM_FAULT_LOCKED;
- goto out;
- }
+ if (folio->mapping != mapping)
+ goto unlock;
+ if (folio_wait_writeback_killable(folio) < 0)
+ goto unlock;
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
- ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
- goto out;
+ ret = VM_FAULT_SIGBUS;
+ goto unlock;
}
group = netfs_folio_group(folio);
@@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
out:
sb_end_pagefault(inode->i_sb);
return ret;
+unlock:
+ folio_unlock(folio);
+ goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cb75c07b5281..ced14ac78cc1 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -322,8 +322,7 @@ maybe_wait:
}
return;
no_wait:
- clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags);
- wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING);
+ clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
}
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 551d2958ec29..d80406f8b568 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8001,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -8014,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -8036,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- /*
- * Most filesystems with their own ->lock operations will block
- * the nfsd thread waiting to acquire the lock. That leads to
- * deadlocks (we don't want every nfsd thread tied up waiting
- * for file locks), so don't attempt blocking lock notifications
- * on those filesystems:
- */
- if (!exportfs_lock_op_is_async(sb->s_export_op))
- flags &= ~FL_SLEEP;
+ if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) &&
+ nfsd4_has_session(cstate) &&
+ locks_can_async_lock(nf->nf_file->f_op))
+ flags |= FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 6dd8b854cd1f..9a849397c768 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -99,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh)
*/
void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
{
- void *kaddr0, *kaddr1;
+ void *saddr, *daddr;
unsigned long bits;
- struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+ struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio;
struct buffer_head *bh;
- kaddr0 = kmap_local_page(spage);
- kaddr1 = kmap_local_page(dpage);
- memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_local(kaddr1);
- kunmap_local(kaddr0);
+ saddr = kmap_local_folio(sfolio, bh_offset(sbh));
+ daddr = kmap_local_folio(dfolio, bh_offset(dbh));
+ memcpy(daddr, saddr, sbh->b_size);
+ kunmap_local(daddr);
+ kunmap_local(saddr);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
@@ -122,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
unlock_buffer(bh);
}
if (bits & BIT(BH_Uptodate))
- SetPageUptodate(dpage);
+ folio_mark_uptodate(dfolio);
else
- ClearPageUptodate(dpage);
+ folio_clear_uptodate(dfolio);
if (bits & BIT(BH_Mapped))
- SetPageMappedToDisk(dpage);
+ folio_set_mappedtodisk(dfolio);
else
- ClearPageMappedToDisk(dpage);
+ folio_clear_mappedtodisk(dfolio);
}
/**
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d5dbef7f5c95..6004dfdfdf0f 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -16,7 +16,6 @@
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
static int dir_notify_enable __read_mostly = 1;
@@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
new_fsn_mark = NULL;
}
- rcu_read_lock();
- f = lookup_fdget_rcu(fd);
- rcu_read_unlock();
+ f = fget_raw(fd);
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index a511f9d8677b..0e36aaf379b7 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -15,7 +15,6 @@ config FANOTIFY
config FANOTIFY_ACCESS_PERMISSIONS
bool "fanotify permissions checking"
depends on FANOTIFY
- depends on SECURITY
default n
help
Say Y here if you want fanotify listeners to be able to make permissions
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 224bccaab4cc..24c7c5df4998 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/jiffies.h>
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9644bc72e457..2d85c71717d6 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
@@ -266,13 +265,6 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
group->fanotify_data.f_flags | __FMODE_NONOTIFY,
current_cred());
if (IS_ERR(new_file)) {
- /*
- * we still send an event even if we can't open the file. this
- * can happen when say tasks are gone and we try to open their
- * /proc files or we try to open a WRONLY file like in sysfs
- * we just send the errno to userspace since there isn't much
- * else we can do.
- */
put_unused_fd(client_fd);
client_fd = PTR_ERR(new_file);
} else {
@@ -663,7 +655,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
struct file *f = NULL, *pidfd_file = NULL;
- int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
+ int ret, pidfd = -ESRCH, fd = -EBADF;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
@@ -691,10 +683,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
path && path->mnt && path->dentry) {
fd = create_fd(group, path, &f);
- if (fd < 0)
- return fd;
+ /*
+ * Opening an fd from dentry can fail for several reasons.
+ * For example, when tasks are gone and we try to open their
+ * /proc files or we try to open a WRONLY file like in sysfs
+ * or when trying to open a file that was deleted on the
+ * remote network server.
+ *
+ * For a group with FAN_REPORT_FD_ERROR, we will send the
+ * event with the error instead of the open fd, otherwise
+ * userspace may not get the error at all.
+ * In any case, userspace will not know which file failed to
+ * open, so add a debug print for further investigation.
+ */
+ if (fd < 0) {
+ pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
+ path->dentry, fd);
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
+ /*
+ * Historically, we've handled EOPENSTALE in a
+ * special way and silently dropped such
+ * events. Now we have to keep it to maintain
+ * backward compatibility...
+ */
+ if (fd == -EOPENSTALE)
+ fd = 0;
+ return fd;
+ }
+ }
}
- metadata.fd = fd;
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
+ metadata.fd = fd;
+ else
+ metadata.fd = fd >= 0 ? fd : FAN_NOFD;
if (pidfd_mode) {
/*
@@ -709,18 +730,16 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
* The PIDTYPE_TGID check for an event->pid is performed
* preemptively in an attempt to catch out cases where the event
* listener reads events after the event generating process has
- * already terminated. Report FAN_NOPIDFD to the event listener
- * in those cases, with all other pidfd creation errors being
- * reported as FAN_EPIDFD.
+ * already terminated. Depending on flag FAN_REPORT_FD_ERROR,
+ * report either -ESRCH or FAN_NOPIDFD to the event listener in
+ * those cases with all other pidfd creation errors reported as
+ * the error code itself or as FAN_EPIDFD.
*/
- if (metadata.pid == 0 ||
- !pid_has_task(event->pid, PIDTYPE_TGID)) {
- pidfd = FAN_NOPIDFD;
- } else {
+ if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
- if (pidfd < 0)
- pidfd = FAN_EPIDFD;
- }
+
+ if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
+ pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
}
ret = -EFAULT;
@@ -737,9 +756,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
buf += FAN_EVENT_METADATA_LEN;
count -= FAN_EVENT_METADATA_LEN;
- if (fanotify_is_perm_event(event->mask))
- FANOTIFY_PERM(event)->fd = fd;
-
if (info_mode) {
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
buf, count);
@@ -753,15 +769,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (pidfd_file)
fd_install(pidfd, pidfd_file);
+ if (fanotify_is_perm_event(event->mask))
+ FANOTIFY_PERM(event)->fd = fd;
+
return metadata.event_len;
out_close_fd:
- if (fd != FAN_NOFD) {
+ if (f) {
put_unused_fd(fd);
fput(f);
}
- if (pidfd >= 0) {
+ if (pidfd_file) {
put_unused_fd(pidfd);
fput(pidfd_file);
}
@@ -828,15 +847,6 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
}
ret = copy_event_to_user(group, event, buf, count);
- if (unlikely(ret == -EOPENSTALE)) {
- /*
- * We cannot report events with stale fd so drop it.
- * Setting ret to 0 will continue the event loop and
- * do the right thing if there are no more events to
- * read (i.e. return bytes read, -EAGAIN or wait).
- */
- ret = 0;
- }
/*
* Permission events get queued to wait for response. Other
@@ -845,7 +855,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
if (!fanotify_is_perm_event(event->mask)) {
fsnotify_destroy_event(group, &event->fse);
} else {
- if (ret <= 0) {
+ if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
spin_lock(&group->notification_lock);
finish_permission_event(group,
FANOTIFY_PERM(event), FAN_DENY, NULL);
@@ -1003,22 +1013,17 @@ static int fanotify_find_path(int dfd, const char __user *filename,
dfd, filename, flags);
if (filename == NULL) {
- struct fd f = fdget(dfd);
+ CLASS(fd, f)(dfd);
- ret = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- ret = -ENOTDIR;
if ((flags & FAN_MARK_ONLYDIR) &&
- !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
- fdput(f);
- goto out;
- }
+ !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
+ return -ENOTDIR;
*path = fd_file(f)->f_path;
path_get(path);
- fdput(f);
} else {
unsigned int lookup_flags = 0;
@@ -1682,7 +1687,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct inode *inode = NULL;
struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
- struct fd f;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
@@ -1752,14 +1756,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
umask = FANOTIFY_EVENT_FLAGS;
}
- f = fdget(fanotify_fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fanotify_fd);
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an fanotify instance */
- ret = -EINVAL;
if (unlikely(fd_file(f)->f_op != &fanotify_fops))
- goto fput_and_out;
+ return -EINVAL;
group = fd_file(f)->private_data;
/*
@@ -1767,23 +1770,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* marks. This also includes setting up such marks by a group that
* was initialized by an unprivileged user.
*/
- ret = -EPERM;
if ((!capable(CAP_SYS_ADMIN) ||
FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EPERM;
/*
* Permission events require minimum priority FAN_CLASS_CONTENT.
*/
- ret = -EINVAL;
if (mask & FANOTIFY_PERM_EVENTS &&
group->priority < FSNOTIFY_PRIO_CONTENT)
- goto fput_and_out;
+ return -EINVAL;
if (mask & FAN_FS_ERROR &&
mark_type != FAN_MARK_FILESYSTEM)
- goto fput_and_out;
+ return -EINVAL;
/*
* Evictable is only relevant for inode marks, because only inode object
@@ -1791,7 +1792,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (flags & FAN_MARK_EVICTABLE &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EINVAL;
/*
* Events that do not carry enough information to report
@@ -1803,7 +1804,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
- goto fput_and_out;
+ return -EINVAL;
/*
* FAN_RENAME uses special info type records to report the old and
@@ -1811,23 +1812,22 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* useful and was not implemented.
*/
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
- goto fput_and_out;
+ return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
- ret = 0;
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
else if (mark_type == FAN_MARK_FILESYSTEM)
fsnotify_clear_sb_marks_by_group(group);
else
fsnotify_clear_inode_marks_by_group(group);
- goto fput_and_out;
+ return 0;
}
ret = fanotify_find_path(dfd, pathname, &path, flags,
(mask & ALL_FSNOTIFY_EVENTS), obj_type);
if (ret)
- goto fput_and_out;
+ return ret;
if (mark_cmd == FAN_MARK_ADD) {
ret = fanotify_events_supported(group, &path, mask, flags);
@@ -1906,8 +1906,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
path_put_and_out:
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
@@ -1954,7 +1952,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 82ae8254c068..f976949d2634 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -333,16 +333,19 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
if (!inode_mark)
return 0;
- if (mask & FS_EVENT_ON_CHILD) {
- /*
- * Some events can be sent on both parent dir and child marks
- * (e.g. FS_ATTRIB). If both parent dir and child are
- * watching, report the event once to parent dir with name (if
- * interested) and once to child without name (if interested).
- * The child watcher is expecting an event without a file name
- * and without the FS_EVENT_ON_CHILD flag.
- */
- mask &= ~FS_EVENT_ON_CHILD;
+ /*
+ * Some events can be sent on both parent dir and child marks (e.g.
+ * FS_ATTRIB). If both parent dir and child are watching, report the
+ * event once to parent dir with name (if interested) and once to child
+ * without name (if interested).
+ *
+ * In any case, regardless of whether the parent is watching or not, the
+ * child watcher is expecting an event without the FS_EVENT_ON_CHILD
+ * flag. The file name is expected if and only if this is a directory
+ * event.
+ */
+ mask &= ~FS_EVENT_ON_CHILD;
+ if (!(mask & ALL_FSNOTIFY_DIRENT_EVENTS)) {
dir = NULL;
name = NULL;
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0794dcaf1e47..e0c48956608a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -732,7 +732,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
- struct fd f;
int ret;
unsigned flags = 0;
@@ -752,21 +751,17 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
- if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
+ return -EINVAL;
/* verify that this is indeed an inotify instance */
- if (unlikely(fd_file(f)->f_op != &inotify_fops)) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely(fd_file(f)->f_op != &inotify_fops))
+ return -EINVAL;
if (!(mask & IN_DONT_FOLLOW))
flags |= LOOKUP_FOLLOW;
@@ -776,7 +771,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
ret = inotify_find_inode(pathname, &path, flags,
(mask & IN_ALL_EVENTS));
if (ret)
- goto fput_and_out;
+ return ret;
/* inode held in place by reference to path; group by fget on fd */
inode = path.dentry->d_inode;
@@ -785,8 +780,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
@@ -794,33 +787,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
{
struct fsnotify_group *group;
struct inotify_inode_mark *i_mark;
- struct fd f;
- int ret = -EINVAL;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an inotify instance */
if (unlikely(fd_file(f)->f_op != &inotify_fops))
- goto out;
+ return -EINVAL;
group = fd_file(f)->private_data;
i_mark = inotify_idr_find(group, wd);
if (unlikely(!i_mark))
- goto out;
-
- ret = 0;
+ return -EINVAL;
fsnotify_destroy_mark(&i_mark->fsn_mark, group);
/* match ref taken by inotify_idr_find */
fsnotify_put_mark(&i_mark->fsn_mark);
-
-out:
- fdput(f);
- return ret;
+ return 0;
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c45b222cf9c1..4981439e6209 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -138,8 +138,11 @@ static void fsnotify_get_sb_watched_objects(struct super_block *sb)
static void fsnotify_put_sb_watched_objects(struct super_block *sb)
{
- if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
- wake_up_var(fsnotify_sb_watched_objects(sb));
+ atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb);
+
+ /* the superblock can go away after this decrement */
+ if (atomic_long_dec_and_test(watched_objects))
+ wake_up_var(watched_objects);
}
static void fsnotify_get_inode_ref(struct inode *inode)
@@ -150,8 +153,11 @@ static void fsnotify_get_inode_ref(struct inode *inode)
static void fsnotify_put_inode_ref(struct inode *inode)
{
- fsnotify_put_sb_watched_objects(inode->i_sb);
+ /* read ->i_sb before the inode can go away */
+ struct super_block *sb = inode->i_sb;
+
iput(inode);
+ fsnotify_put_sb_watched_objects(sb);
}
/*
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4b9f45d7049e..4200a0341343 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1765,42 +1765,41 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
long fd;
int sectsize;
char *p = (char *)page;
- struct fd f;
ssize_t ret = -EINVAL;
int live_threshold;
if (reg->hr_bdev_file)
- goto out;
+ return -EINVAL;
/* We can't heartbeat without having had our node number
* configured yet. */
if (o2nm_this_node() == O2NM_MAX_NODES)
- goto out;
+ return -EINVAL;
fd = simple_strtol(p, &p, 0);
if (!p || (*p && (*p != '\n')))
- goto out;
+ return -EINVAL;
if (fd < 0 || fd >= INT_MAX)
- goto out;
+ return -EINVAL;
- f = fdget(fd);
- if (fd_file(f) == NULL)
- goto out;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EINVAL;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out2;
+ return -EINVAL;
if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
- goto out2;
+ return -EINVAL;
reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(reg->hr_bdev_file)) {
ret = PTR_ERR(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
- goto out2;
+ return ret;
}
sectsize = bdev_logical_block_size(reg_bdev(reg));
@@ -1906,9 +1905,6 @@ out3:
fput(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
}
-out2:
- fdput(f);
-out:
return ret;
}
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96b684763b39..b95724b767e1 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 06af21982c16..4fa6c840d20b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2812,6 +2812,7 @@ const struct file_operations ocfs2_fops = {
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
.remap_file_range = ocfs2_remap_file_range,
+ .fop_flags = FOP_ASYNC_LOCK,
};
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
@@ -2828,6 +2829,7 @@ const struct file_operations ocfs2_dops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
+ .fop_flags = FOP_ASYNC_LOCK,
};
/*
diff --git a/fs/open.c b/fs/open.c
index 5da4df2f9b18..e6911101fe71 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -187,19 +187,13 @@ long do_ftruncate(struct file *file, loff_t length, int small)
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
- struct fd f;
- int error;
-
if (length < 0)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = do_ftruncate(fd_file(f), length, small);
-
- fdput(f);
- return error;
+ return do_ftruncate(fd_file(f), length, small);
}
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
@@ -349,14 +343,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate);
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fallocate(fd_file(f), mode, offset, len);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fallocate(fd_file(f), mode, offset, len);
}
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
@@ -580,23 +572,18 @@ out:
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
int error;
- error = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- error = -ENOTDIR;
if (!d_can_lookup(fd_file(f)->f_path.dentry))
- goto out_putf;
+ return -ENOTDIR;
error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &fd_file(f)->f_path);
-out_putf:
- fdput(f);
-out:
return error;
}
@@ -671,14 +658,12 @@ int vfs_fchmod(struct file *file, umode_t mode)
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
- struct fd f = fdget(fd);
- int err = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- err = vfs_fchmod(fd_file(f), mode);
- fdput(f);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchmod(fd_file(f), mode);
}
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
@@ -865,14 +850,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group)
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fchown(fd_file(f), user, group);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchown(fd_file(f), user, group);
}
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
@@ -946,6 +929,10 @@ static int do_dentry_open(struct file *f,
if (error)
goto cleanup_all;
+ error = fsnotify_open_perm(f);
+ if (error)
+ goto cleanup_all;
+
error = break_lease(file_inode(f), f->f_flags);
if (error)
goto cleanup_all;
@@ -1576,23 +1563,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval;
}
-/**
- * sys_close_range() - Close all file descriptors in a given range.
- *
- * @fd: starting file descriptor to close
- * @max_fd: last file descriptor to close
- * @flags: reserved for future extensions
- *
- * This closes a range of file descriptors. All file descriptors
- * from @fd up to and including @max_fd are closed.
- * Currently, errors to close a given file descriptor are ignored.
- */
-SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
- unsigned int, flags)
-{
- return __close_range(fd, max_fd, flags);
-}
-
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2ed6ad641a20..3601ddfeddc2 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -16,7 +16,6 @@
#include <linux/sched/signal.h>
#include <linux/cred.h>
#include <linux/namei.h>
-#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
@@ -1260,7 +1259,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
dput(parent);
dput(next);
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ab65e98a1def..08e683917d12 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -553,15 +553,17 @@ out_cleanup:
goto out_dput;
}
-static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode,
- umode_t mode, const struct cred *old_cred)
+static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry,
+ struct inode *inode,
+ umode_t mode,
+ const struct cred *old_cred)
{
int err;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
@@ -569,19 +571,26 @@ static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode,
old_cred, override_cred);
if (err) {
put_cred(override_cred);
- return err;
+ return ERR_PTR(err);
}
- put_cred(override_creds(override_cred));
- put_cred(override_cred);
- return 0;
+ /*
+ * Caller is going to match this with revert_creds_light() and drop
+ * the reference on the returned creds.
+ * We must be called with creator creds already, otherwise we risk
+ * leaking creds.
+ */
+ old_cred = override_creds_light(override_cred);
+ WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb));
+
+ return override_cred;
}
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *attr, bool origin)
{
int err;
- const struct cred *old_cred;
+ const struct cred *old_cred, *new_cred = NULL;
struct dentry *parent = dentry->d_parent;
old_cred = ovl_override_creds(dentry->d_sb);
@@ -610,9 +619,13 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
* create a new inode, so just use the ovl mounter's
* fs{u,g}id.
*/
- err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred);
- if (err)
+ new_cred = ovl_setup_cred_for_create(dentry, inode, attr->mode,
+ old_cred);
+ err = PTR_ERR(new_cred);
+ if (IS_ERR(new_cred)) {
+ new_cred = NULL;
goto out_revert_creds;
+ }
}
if (!ovl_dentry_is_whiteout(dentry))
@@ -621,7 +634,8 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
err = ovl_create_over_whiteout(dentry, inode, attr);
out_revert_creds:
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
+ put_cred(new_cred);
return err;
}
@@ -702,7 +716,7 @@ static int ovl_set_link_redirect(struct dentry *dentry)
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_set_redirect(dentry, false);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -912,7 +926,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
err = ovl_remove_upper(dentry, is_dir, &list);
else
err = ovl_remove_and_whiteout(dentry, &list);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (!err) {
if (is_dir)
clear_nlink(dentry->d_inode);
@@ -1292,7 +1306,7 @@ out_dput_old:
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (update_nlink)
ovl_nlink_end(new);
else
@@ -1306,18 +1320,22 @@ out:
static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
struct inode *inode, umode_t mode)
{
- const struct cred *old_cred;
+ const struct cred *old_cred, *new_cred = NULL;
struct path realparentpath;
struct file *realfile;
+ struct ovl_file *of;
struct dentry *newdentry;
/* It's okay to set O_NOATIME, since the owner will be current fsuid */
int flags = file->f_flags | OVL_OPEN_FLAGS;
int err;
old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
- if (err)
+ new_cred = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
+ err = PTR_ERR(new_cred);
+ if (IS_ERR(new_cred)) {
+ new_cred = NULL;
goto out_revert_creds;
+ }
ovl_path_upper(dentry->d_parent, &realparentpath);
realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
@@ -1327,17 +1345,25 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
if (err)
goto out_revert_creds;
+ of = ovl_file_alloc(realfile);
+ if (!of) {
+ fput(realfile);
+ err = -ENOMEM;
+ goto out_revert_creds;
+ }
+
/* ovl_instantiate() consumes the newdentry reference on success */
newdentry = dget(realfile->f_path.dentry);
err = ovl_instantiate(dentry, inode, newdentry, false, file);
if (!err) {
- file->private_data = realfile;
+ file->private_data = of;
} else {
dput(newdentry);
- fput(realfile);
+ ovl_file_free(of);
}
out_revert_creds:
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
+ put_cred(new_cred);
return err;
}
@@ -1389,7 +1415,7 @@ static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
put_realfile:
/* Without FMODE_OPENED ->release() won't be called on @file */
if (!(file->f_mode & FMODE_OPENED))
- fput(file->private_data);
+ ovl_file_free(file->private_data);
put_inode:
iput(inode);
drop_write:
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4444c78e2e0c..969b458100fe 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -51,7 +51,7 @@ static struct file *ovl_open_realfile(const struct file *file,
realfile = backing_file_open(&file->f_path, flags, realpath,
current_cred());
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
file, file, ovl_whatisit(inode, realinode), file->f_flags,
@@ -89,56 +89,110 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
return 0;
}
-static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
- bool allow_meta)
+struct ovl_file {
+ struct file *realfile;
+ struct file *upperfile;
+};
+
+struct ovl_file *ovl_file_alloc(struct file *realfile)
{
- struct dentry *dentry = file_dentry(file);
- struct file *realfile = file->private_data;
- struct path realpath;
- int err;
+ struct ovl_file *of = kzalloc(sizeof(struct ovl_file), GFP_KERNEL);
- real->word = (unsigned long)realfile;
+ if (unlikely(!of))
+ return NULL;
- if (allow_meta) {
- ovl_path_real(dentry, &realpath);
- } else {
- /* lazy lookup and verify of lowerdata */
- err = ovl_verify_lowerdata(dentry);
- if (err)
- return err;
+ of->realfile = realfile;
+ return of;
+}
- ovl_path_realdata(dentry, &realpath);
- }
- if (!realpath.dentry)
- return -EIO;
+void ovl_file_free(struct ovl_file *of)
+{
+ fput(of->realfile);
+ if (of->upperfile)
+ fput(of->upperfile);
+ kfree(of);
+}
- /* Has it been copied up since we'd opened it? */
- if (unlikely(file_inode(realfile) != d_inode(realpath.dentry))) {
- struct file *f = ovl_open_realfile(file, &realpath);
- if (IS_ERR(f))
- return PTR_ERR(f);
- real->word = (unsigned long)f | FDPUT_FPUT;
- return 0;
+static bool ovl_is_real_file(const struct file *realfile,
+ const struct path *realpath)
+{
+ return file_inode(realfile) == d_inode(realpath->dentry);
+}
+
+static struct file *ovl_real_file_path(const struct file *file,
+ struct path *realpath)
+{
+ struct ovl_file *of = file->private_data;
+ struct file *realfile = of->realfile;
+
+ if (WARN_ON_ONCE(!realpath->dentry))
+ return ERR_PTR(-EIO);
+
+ /*
+ * If the realfile that we want is not where the data used to be at
+ * open time, either we'd been copied up, or it's an fsync of a
+ * metacopied file. We need the upperfile either way, so see if it
+ * is already opened and if it is not then open and store it.
+ */
+ if (unlikely(!ovl_is_real_file(realfile, realpath))) {
+ struct file *upperfile = READ_ONCE(of->upperfile);
+ struct file *old;
+
+ if (!upperfile) { /* Nobody opened upperfile yet */
+ upperfile = ovl_open_realfile(file, realpath);
+ if (IS_ERR(upperfile))
+ return upperfile;
+
+ /* Store the upperfile for later */
+ old = cmpxchg_release(&of->upperfile, NULL, upperfile);
+ if (old) { /* Someone opened upperfile before us */
+ fput(upperfile);
+ upperfile = old;
+ }
+ }
+ /*
+ * Stored file must be from the right inode, unless someone's
+ * been corrupting the upper layer.
+ */
+ if (WARN_ON_ONCE(!ovl_is_real_file(upperfile, realpath)))
+ return ERR_PTR(-EIO);
+
+ realfile = upperfile;
}
/* Did the flags change since open? */
- if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS))
- return ovl_change_flags(realfile, file->f_flags);
+ if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS)) {
+ int err = ovl_change_flags(realfile, file->f_flags);
- return 0;
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ return realfile;
}
-static int ovl_real_fdget(const struct file *file, struct fd *real)
+static struct file *ovl_real_file(const struct file *file)
{
- if (d_is_dir(file_dentry(file))) {
+ struct dentry *dentry = file_dentry(file);
+ struct path realpath;
+ int err;
+
+ if (d_is_dir(dentry)) {
struct file *f = ovl_dir_real_file(file, false);
- if (IS_ERR(f))
- return PTR_ERR(f);
- real->word = (unsigned long)f;
- return 0;
+
+ if (WARN_ON_ONCE(!f))
+ return ERR_PTR(-EIO);
+ return f;
}
- return ovl_real_fdget_meta(file, real, false);
+ /* lazy lookup and verify of lowerdata */
+ err = ovl_verify_lowerdata(dentry);
+ if (err)
+ return ERR_PTR(err);
+
+ ovl_path_realdata(dentry, &realpath);
+
+ return ovl_real_file_path(file, &realpath);
}
static int ovl_open(struct inode *inode, struct file *file)
@@ -146,6 +200,7 @@ static int ovl_open(struct inode *inode, struct file *file)
struct dentry *dentry = file_dentry(file);
struct file *realfile;
struct path realpath;
+ struct ovl_file *of;
int err;
/* lazy lookup and verify lowerdata */
@@ -168,22 +223,27 @@ static int ovl_open(struct inode *inode, struct file *file)
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- file->private_data = realfile;
+ of = ovl_file_alloc(realfile);
+ if (!of) {
+ fput(realfile);
+ return -ENOMEM;
+ }
+
+ file->private_data = of;
return 0;
}
static int ovl_release(struct inode *inode, struct file *file)
{
- fput(file->private_data);
-
+ ovl_file_free(file->private_data);
return 0;
}
static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
- struct fd real;
+ struct file *realfile;
const struct cred *old_cred;
loff_t ret;
@@ -199,9 +259,9 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, 0, 0);
}
- ret = ovl_real_fdget(file, &real);
- if (ret)
- return ret;
+ realfile = ovl_real_file(file);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
/*
* Overlay file f_pos is the master copy that is preserved
@@ -211,17 +271,15 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
* files, so we use the real file to perform seeks.
*/
ovl_inode_lock(inode);
- fd_file(real)->f_pos = file->f_pos;
+ realfile->f_pos = file->f_pos;
old_cred = ovl_override_creds(inode->i_sb);
- ret = vfs_llseek(fd_file(real), offset, whence);
- revert_creds(old_cred);
+ ret = vfs_llseek(realfile, offset, whence);
+ ovl_revert_creds(old_cred);
- file->f_pos = fd_file(real)->f_pos;
+ file->f_pos = realfile->f_pos;
ovl_inode_unlock(inode);
- fdput(real);
-
return ret;
}
@@ -231,9 +289,9 @@ static void ovl_file_modified(struct file *file)
ovl_copyattr(file_inode(file));
}
-static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret)
+static void ovl_file_end_write(struct kiocb *iocb, ssize_t ret)
{
- ovl_file_modified(file);
+ ovl_file_modified(iocb->ki_filp);
}
static void ovl_file_accessed(struct file *file)
@@ -267,38 +325,32 @@ static void ovl_file_accessed(struct file *file)
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
- struct fd real;
- ssize_t ret;
+ struct file *realfile;
struct backing_file_ctx ctx = {
.cred = ovl_creds(file_inode(file)->i_sb),
- .user_file = file,
.accessed = ovl_file_accessed,
};
if (!iov_iter_count(iter))
return 0;
- ret = ovl_real_fdget(file, &real);
- if (ret)
- return ret;
-
- ret = backing_file_read_iter(fd_file(real), iter, iocb, iocb->ki_flags,
- &ctx);
- fdput(real);
+ realfile = ovl_real_file(file);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
- return ret;
+ return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags,
+ &ctx);
}
static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct fd real;
+ struct file *realfile;
ssize_t ret;
int ifl = iocb->ki_flags;
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
- .user_file = file,
.end_write = ovl_file_end_write,
};
@@ -309,8 +361,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
/* Update mode */
ovl_copyattr(inode);
- ret = ovl_real_fdget(file, &real);
- if (ret)
+ realfile = ovl_real_file(file);
+ ret = PTR_ERR(realfile);
+ if (IS_ERR(realfile))
goto out_unlock;
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
@@ -321,8 +374,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
- ret = backing_file_write_iter(fd_file(real), iter, iocb, ifl, &ctx);
- fdput(real);
+ ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx);
out_unlock:
inode_unlock(inode);
@@ -334,20 +386,22 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- struct fd real;
+ struct file *realfile;
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ovl_creds(file_inode(in)->i_sb),
- .user_file = in,
.accessed = ovl_file_accessed,
};
+ struct kiocb iocb;
- ret = ovl_real_fdget(in, &real);
- if (ret)
- return ret;
+ realfile = ovl_real_file(in);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
- ret = backing_file_splice_read(fd_file(real), ppos, pipe, len, flags, &ctx);
- fdput(real);
+ init_sync_kiocb(&iocb, in);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_read(realfile, &iocb, pipe, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
return ret;
}
@@ -355,7 +409,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
/*
* Calling iter_file_splice_write() directly from overlay's f_op may deadlock
* due to lock order inversion between pipe->mutex in iter_file_splice_write()
- * and file_start_write(fd_file(real)) in ovl_write_iter().
+ * and file_start_write(realfile) in ovl_write_iter().
*
* So do everything ovl_write_iter() does and call iter_file_splice_write() on
* the real file.
@@ -363,25 +417,28 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- struct fd real;
+ struct file *realfile;
struct inode *inode = file_inode(out);
ssize_t ret;
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
- .user_file = out,
.end_write = ovl_file_end_write,
};
+ struct kiocb iocb;
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = ovl_real_fdget(out, &real);
- if (ret)
+ realfile = ovl_real_file(out);
+ ret = PTR_ERR(realfile);
+ if (IS_ERR(realfile))
goto out_unlock;
- ret = backing_file_splice_write(pipe, fd_file(real), ppos, len, flags, &ctx);
- fdput(real);
+ init_sync_kiocb(&iocb, out);
+ iocb.ki_pos = *ppos;
+ ret = backing_file_splice_write(pipe, realfile, &iocb, len, flags, &ctx);
+ *ppos = iocb.ki_pos;
out_unlock:
inode_unlock(inode);
@@ -391,7 +448,10 @@ out_unlock:
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct fd real;
+ struct dentry *dentry = file_dentry(file);
+ enum ovl_path_type type;
+ struct path upperpath;
+ struct file *upperfile;
const struct cred *old_cred;
int ret;
@@ -399,38 +459,38 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (ret <= 0)
return ret;
- ret = ovl_real_fdget_meta(file, &real, !datasync);
- if (ret)
- return ret;
-
/* Don't sync lower file for fear of receiving EROFS error */
- if (file_inode(fd_file(real)) == ovl_inode_upper(file_inode(file))) {
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fsync_range(fd_file(real), start, end, datasync);
- revert_creds(old_cred);
- }
+ type = ovl_path_type(dentry);
+ if (!OVL_TYPE_UPPER(type) || (datasync && OVL_TYPE_MERGE(type)))
+ return 0;
- fdput(real);
+ ovl_path_upper(dentry, &upperpath);
+ upperfile = ovl_real_file_path(file, &upperpath);
+ if (IS_ERR(upperfile))
+ return PTR_ERR(upperfile);
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fsync_range(upperfile, start, end, datasync);
+ ovl_revert_creds(old_cred);
return ret;
}
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct file *realfile = file->private_data;
+ struct ovl_file *of = file->private_data;
struct backing_file_ctx ctx = {
.cred = ovl_creds(file_inode(file)->i_sb),
- .user_file = file,
.accessed = ovl_file_accessed,
};
- return backing_file_mmap(realfile, vma, &ctx);
+ return backing_file_mmap(of->realfile, vma, &ctx);
}
static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- struct fd real;
+ struct file *realfile;
const struct cred *old_cred;
int ret;
@@ -441,19 +501,18 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
if (ret)
goto out_unlock;
- ret = ovl_real_fdget(file, &real);
- if (ret)
+ realfile = ovl_real_file(file);
+ ret = PTR_ERR(realfile);
+ if (IS_ERR(realfile))
goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fallocate(fd_file(real), mode, offset, len);
- revert_creds(old_cred);
+ ret = vfs_fallocate(realfile, mode, offset, len);
+ ovl_revert_creds(old_cred);
/* Update size */
ovl_file_modified(file);
- fdput(real);
-
out_unlock:
inode_unlock(inode);
@@ -462,19 +521,17 @@ out_unlock:
static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
- struct fd real;
+ struct file *realfile;
const struct cred *old_cred;
int ret;
- ret = ovl_real_fdget(file, &real);
- if (ret)
- return ret;
+ realfile = ovl_real_file(file);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fadvise(fd_file(real), offset, len, advice);
- revert_creds(old_cred);
-
- fdput(real);
+ ret = vfs_fadvise(realfile, offset, len, advice);
+ ovl_revert_creds(old_cred);
return ret;
}
@@ -490,7 +547,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
loff_t len, unsigned int flags, enum ovl_copyop op)
{
struct inode *inode_out = file_inode(file_out);
- struct fd real_in, real_out;
+ struct file *realfile_in, *realfile_out;
const struct cred *old_cred;
loff_t ret;
@@ -503,42 +560,39 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
goto out_unlock;
}
- ret = ovl_real_fdget(file_out, &real_out);
- if (ret)
+ realfile_out = ovl_real_file(file_out);
+ ret = PTR_ERR(realfile_out);
+ if (IS_ERR(realfile_out))
goto out_unlock;
- ret = ovl_real_fdget(file_in, &real_in);
- if (ret) {
- fdput(real_out);
+ realfile_in = ovl_real_file(file_in);
+ ret = PTR_ERR(realfile_in);
+ if (IS_ERR(realfile_in))
goto out_unlock;
- }
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
switch (op) {
case OVL_COPY:
- ret = vfs_copy_file_range(fd_file(real_in), pos_in,
- fd_file(real_out), pos_out, len, flags);
+ ret = vfs_copy_file_range(realfile_in, pos_in,
+ realfile_out, pos_out, len, flags);
break;
case OVL_CLONE:
- ret = vfs_clone_file_range(fd_file(real_in), pos_in,
- fd_file(real_out), pos_out, len, flags);
+ ret = vfs_clone_file_range(realfile_in, pos_in,
+ realfile_out, pos_out, len, flags);
break;
case OVL_DEDUPE:
- ret = vfs_dedupe_file_range_one(fd_file(real_in), pos_in,
- fd_file(real_out), pos_out, len,
+ ret = vfs_dedupe_file_range_one(realfile_in, pos_in,
+ realfile_out, pos_out, len,
flags);
break;
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
/* Update size */
ovl_file_modified(file_out);
- fdput(real_in);
- fdput(real_out);
-
out_unlock:
inode_unlock(inode_out);
@@ -582,20 +636,19 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
static int ovl_flush(struct file *file, fl_owner_t id)
{
- struct fd real;
+ struct file *realfile;
const struct cred *old_cred;
- int err;
+ int err = 0;
- err = ovl_real_fdget(file, &real);
- if (err)
- return err;
+ realfile = ovl_real_file(file);
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
- if (fd_file(real)->f_op->flush) {
+ if (realfile->f_op->flush) {
old_cred = ovl_override_creds(file_inode(file)->i_sb);
- err = fd_file(real)->f_op->flush(fd_file(real), id);
- revert_creds(old_cred);
+ err = realfile->f_op->flush(realfile, id);
+ ovl_revert_creds(old_cred);
}
- fdput(real);
return err;
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 35fd3e3e1778..6f0e15f86c21 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -80,7 +80,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_do_notify_change(ofs, upperdentry, attr);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
@@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_do_getattr(&realpath, stat, request_mask, flags);
+ err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
- flags);
+ err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
+ flags);
if (err)
goto out;
@@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = ovl_do_getattr(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = vfs_getattr_nosec(&realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
goto out;
} else {
@@ -280,7 +280,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->nlink = dentry->d_inode->i_nlink;
out:
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -317,7 +317,7 @@ int ovl_permission(struct mnt_idmap *idmap,
mask |= MAY_READ;
}
err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -334,7 +334,7 @@ static const char *ovl_get_link(struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
p = vfs_get_link(ovl_dentry_real(dentry), done);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return p;
}
@@ -469,7 +469,7 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
old_cred = ovl_override_creds(inode->i_sb);
acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
}
return acl;
@@ -498,7 +498,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
old_cred = ovl_override_creds(dentry->d_sb);
real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
acl_name);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
goto out;
@@ -523,7 +523,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
err = ovl_do_remove_acl(ofs, realdentry, acl_name);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -600,7 +600,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
old_cred = ovl_override_creds(inode->i_sb);
err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -616,8 +616,13 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f
struct file *file;
unsigned int cmd;
int err;
+ unsigned int flags;
+
+ flags = O_RDONLY;
+ if (force_o_largefile())
+ flags |= O_LARGEFILE;
- file = dentry_open(realpath, O_RDONLY, current_cred());
+ file = dentry_open(realpath, flags, current_cred());
if (IS_ERR(file))
return PTR_ERR(file);
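
A minimal sketch of the fixed pattern as a reusable helper (helper name hypothetical; force_o_largefile() and dentry_open() are the kernel APIs used in the hunk). Kernel-internal opens should not inherit the 32-bit userspace ABI's 2GiB limit, so the flag is computed the same way the open(2) path does:

static struct file *example_open_real_rdonly(const struct path *realpath)
{
        unsigned int flags = O_RDONLY;

        /* without O_LARGEFILE, files >2GiB fail with -EOVERFLOW on 32-bit */
        if (force_o_largefile())
                flags |= O_LARGEFILE;

        return dentry_open(realpath, flags, current_cred());
}
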
@@ -671,7 +676,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
err = ovl_set_protattr(inode, upperpath.dentry, fa);
if (!err)
err = ovl_real_fileattr_set(&upperpath, fa);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
ovl_drop_write(dentry);
/*
@@ -733,7 +738,7 @@ int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
old_cred = ovl_override_creds(inode->i_sb);
err = ovl_real_fileattr_get(&realpath, fa);
ovl_fileattr_prot_flags(inode, fa);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 5764f91d283e..7e27b7d4adee 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -961,7 +961,7 @@ static int ovl_maybe_validate_verity(struct dentry *dentry)
if (err == 0)
ovl_set_flag(OVL_VERIFIED_DIGEST, inode);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
}
ovl_inode_unlock(inode);
@@ -995,7 +995,7 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_lookup_data_layers(dentry, redirect, &datapath);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (err)
goto out_err;
@@ -1342,7 +1342,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode));
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (origin_path) {
dput(origin_path->dentry);
kfree(origin_path);
@@ -1366,7 +1366,7 @@ out_put_upper:
kfree(upperredirect);
out:
kfree(d.redirect);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return ERR_PTR(err);
}
@@ -1423,7 +1423,7 @@ bool ovl_lower_positive(struct dentry *dentry)
dput(this);
}
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return positive;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0bfe35da4b7b..b361f35762be 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
-static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
/* util.c */
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
@@ -429,6 +421,7 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+void ovl_revert_creds(const struct cred *old_cred);
static inline const struct cred *ovl_creds(struct super_block *sb)
{
@@ -862,6 +855,9 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
+struct ovl_file;
+struct ovl_file *ovl_file_alloc(struct file *realfile);
+void ovl_file_free(struct ovl_file *of);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index e42546c6c5df..1115c22deca0 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void)
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
- fsparam_string("lowerdir+", Opt_lowerdir_add),
- fsparam_string("datadir+", Opt_datadir_add),
- fsparam_string("upperdir", Opt_upperdir),
- fsparam_string("workdir", Opt_workdir),
+ fsparam_file_or_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_file_or_string("datadir+", Opt_datadir_add),
+ fsparam_file_or_string("upperdir", Opt_upperdir),
+ fsparam_file_or_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir),
fsparam_enum("index", Opt_index, ovl_parameter_bool),
@@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
}
}
-static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer)
+static inline bool is_upper_layer(enum ovl_opt layer)
+{
+ return layer == Opt_upperdir || layer == Opt_workdir;
+}
+
+/* Handle non-file-descriptor-based layer options that require path lookup. */
+static inline int ovl_kern_path(const char *layer_name, struct path *layer_path,
+ enum ovl_opt layer)
{
- char *name = kstrdup(layer_name, GFP_KERNEL);
- bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
- struct path path;
int err;
+ switch (layer) {
+ case Opt_upperdir:
+ fallthrough;
+ case Opt_workdir:
+ fallthrough;
+ case Opt_lowerdir:
+ err = ovl_mount_dir(layer_name, layer_path);
+ break;
+ case Opt_lowerdir_add:
+ fallthrough;
+ case Opt_datadir_add:
+ err = ovl_mount_dir_noesc(layer_name, layer_path);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name,
+ struct path *layer_path, enum ovl_opt layer)
+{
+ char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL);
+ bool upper;
+ int err = 0;
+
if (!name)
return -ENOMEM;
- if (upper || layer == Opt_lowerdir)
- err = ovl_mount_dir(name, &path);
- else
- err = ovl_mount_dir_noesc(name, &path);
+ upper = is_upper_layer(layer);
+ err = ovl_mount_dir_check(fc, layer_path, layer, name, upper);
if (err)
- goto out_free;
-
- err = ovl_mount_dir_check(fc, &path, layer, name, upper);
- if (err)
- goto out_put;
+ return err;
if (!upper) {
err = ovl_ctx_realloc_lower(fc);
if (err)
- goto out_put;
+ return err;
}
/* Store the user provided path string in ctx to show in mountinfo */
- ovl_add_layer(fc, layer, &path, &name);
+ ovl_add_layer(fc, layer, layer_path, &name);
+ return err;
+}
+
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ struct path layer_path __free(path_put) = {};
+ int err = 0;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ err = ovl_kern_path(param->string, &layer_path, layer);
+ if (err)
+ return err;
+ err = ovl_do_parse_layer(fc, param->string, &layer_path, layer);
+ break;
+ case fs_value_is_file: {
+ char *buf __free(kfree);
+ char *layer_name;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return -ENOMEM;
+
+ layer_path = param->file->f_path;
+ path_get(&layer_path);
+
+ layer_name = d_path(&layer_path, buf, PATH_MAX);
+ if (IS_ERR(layer_name))
+ return PTR_ERR(layer_name);
+
+ err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer);
+ break;
+ }
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
-out_put:
- path_put(&path);
-out_free:
- kfree(name);
return err;
}
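
A hypothetical userspace sketch of what the fsparam_file_or_string() conversion above enables: layer directories can now be handed to the new mount API as file descriptors via FSCONFIG_SET_FD, not only as path strings (glibc has no fsconfig() wrapper, hence syscall(2)):

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>        /* FSCONFIG_SET_FD */

static int add_lower_layer(int fsfd, const char *dir)
{
        int dirfd = open(dir, O_PATH | O_DIRECTORY | O_CLOEXEC);

        if (dirfd < 0)
                return -1;
        /* for FSCONFIG_SET_FD the value is NULL and the fd goes in aux */
        return syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FD, "lowerdir+",
                       NULL, dirfd);
}
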
@@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
iter = dup;
for (nr = 0; nr < nr_lower; nr++) {
- err = ovl_parse_layer(fc, iter, Opt_lowerdir);
+ struct path path __free(path_put) = {};
+
+ err = ovl_kern_path(iter, &path, Opt_lowerdir);
+ if (err)
+ goto out_err;
+
+ err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir);
if (err)
goto out_err;
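
The struct path __free(path_put) = {} declarations above use the scope-based cleanup machinery from <linux/cleanup.h>: the named cleanup function runs on every exit from the scope, and path_put() on a zero-initialized path is a no-op. A minimal sketch of the idiom (function name hypothetical):

#include <linux/cleanup.h>
#include <linux/namei.h>
#include <linux/path.h>

static int example_lookup_dir(const char *name)
{
        struct path path __free(path_put) = {};
        int err = kern_path(name, LOOKUP_FOLLOW, &path);

        if (err)
                return err;     /* cleanup on the still-zeroed path is a no-op */

        return d_can_lookup(path.dentry) ? 0 : -ENOTDIR;  /* path_put() runs here */
}
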
@@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_datadir_add:
case Opt_upperdir:
case Opt_workdir:
- err = ovl_parse_layer(fc, param->string, opt);
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 0ca8af060b0c..881ec5592da5 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -290,7 +290,7 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
}
inode_unlock(dir->d_inode);
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -808,7 +808,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
}
err = 0;
out:
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return err;
}
@@ -860,7 +860,7 @@ static struct file *ovl_dir_open_realfile(const struct file *file,
old_cred = ovl_override_creds(file_inode(file)->i_sb);
res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return res;
}
@@ -987,7 +987,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
old_cred = ovl_override_creds(dentry->d_sb);
err = ovl_dir_read_merged(dentry, list, &root);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (err)
return err;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index edc9216f6e27..9aa7493b1e10 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -65,7 +65,12 @@ const struct cred *ovl_override_creds(struct super_block *sb)
{
struct ovl_fs *ofs = OVL_FS(sb);
- return override_creds(ofs->creator_cred);
+ return override_creds_light(ofs->creator_cred);
+}
+
+void ovl_revert_creds(const struct cred *old_cred)
+{
+ revert_creds_light(old_cred);
}
/*
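
A minimal sketch of the calling convention these two helpers establish (caller name hypothetical): the _light variants only swap current->cred and skip the get_cred()/put_cred() reference counting, which is safe because ofs->creator_cred stays pinned for the lifetime of the superblock:

static int ovl_example_op(struct dentry *dentry)
{
        const struct cred *old_cred;
        int err;

        old_cred = ovl_override_creds(dentry->d_sb);
        err = 0;        /* ... act on the real upper/lower filesystem here ... */
        ovl_revert_creds(old_cred);
        return err;
}
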
@@ -197,6 +202,9 @@ void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
bool ovl_dentry_weird(struct dentry *dentry)
{
+ if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry))
+ return true;
+
return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
DCACHE_MANAGE_TRANSIT |
DCACHE_OP_HASH |
@@ -1178,7 +1186,7 @@ int ovl_nlink_start(struct dentry *dentry)
* value relative to the upper inode nlink in an upper inode xattr.
*/
err = ovl_set_nlink_upper(dentry);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (err)
goto out_drop_write;
@@ -1203,7 +1211,7 @@ void ovl_nlink_end(struct dentry *dentry)
old_cred = ovl_override_creds(dentry->d_sb);
ovl_cleanup_index(dentry);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
}
ovl_inode_unlock(inode);
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
index 383978e4663c..88055deca936 100644
--- a/fs/overlayfs/xattrs.c
+++ b/fs/overlayfs/xattrs.c
@@ -47,7 +47,7 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
ovl_path_lower(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (err < 0)
goto out;
}
@@ -72,7 +72,7 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
WARN_ON(flags != XATTR_REPLACE);
err = ovl_do_removexattr(ofs, realdentry, name);
}
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -91,7 +91,7 @@ static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char
ovl_i_path_real(inode, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
return res;
}
@@ -121,7 +121,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_listxattr(realdentry, list, size);
- revert_creds(old_cred);
+ ovl_revert_creds(old_cred);
if (res <= 0 || size == 0)
return res;
@@ -268,4 +268,3 @@ const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs)
return ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
}
-
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf884..618abb1fa1b8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
@@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+ struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ size_t usize = _IOC_SIZE(cmd);
+ struct pidfd_info kinfo = {};
+ struct user_namespace *user_ns;
+ const struct cred *c;
+ __u64 mask;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
+
+ if (!uinfo)
+ return -EINVAL;
+ if (usize < PIDFD_INFO_SIZE_VER0)
+ return -EINVAL; /* First version, no smaller struct possible */
+
+ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
+ return -EFAULT;
+
+ c = get_task_cred(task);
+ if (!c)
+ return -ESRCH;
+
+ /* Unconditionally return identifiers and credentials, the rest only on request */
+
+ user_ns = current_user_ns();
+ kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+ kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+ kinfo.euid = from_kuid_munged(user_ns, c->euid);
+ kinfo.egid = from_kgid_munged(user_ns, c->egid);
+ kinfo.suid = from_kuid_munged(user_ns, c->suid);
+ kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+ kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+ kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+ kinfo.mask |= PIDFD_INFO_CREDS;
+ put_cred(c);
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+#endif
+
+ /*
+ * Copy pid/tgid last, to reduce the chances the information might be
+ * stale. Note that it is not possible to ensure it will be valid as the
+ * task might exit as soon as the copy_to_user() finishes, but that's ok
+ * and userspace expects that this might happen and can act accordingly, so
+ * this is just best-effort. What we can do, however, is check that all
+ * the fields are set correctly, or return ESRCH to avoid providing
+ * incomplete information. */
+
+ kinfo.ppid = task_ppid_nr_ns(task, NULL);
+ kinfo.tgid = task_tgid_vnr(task);
+ kinfo.pid = task_pid_vnr(task);
+ kinfo.mask |= PIDFD_INFO_PID;
+
+ if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+ return -ESRCH;
+
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied.
+ */
+ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+ return -EFAULT;
+
+ return 0;
+}
+
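
A hypothetical userspace sketch of the new ioctl (header name and field types assumed; the field names mirror kinfo on the kernel side above). The caller requests optional blocks via mask, and the kernel copies back min(usize, sizeof(kinfo)) bytes so binaries built against older or newer structs interoperate:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/pidfd.h>        /* PIDFD_GET_INFO, struct pidfd_info (assumed) */

static int print_pidfd_info(int pidfd)
{
        struct pidfd_info info;

        memset(&info, 0, sizeof(info));
        info.mask = PIDFD_INFO_PID | PIDFD_INFO_CGROUPID; /* creds always returned */

        if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0)
                return -1;

        printf("pid=%u tgid=%u ruid=%u euid=%u",
               (unsigned int)info.pid, (unsigned int)info.tgid,
               (unsigned int)info.ruid, (unsigned int)info.euid);
        if (info.mask & PIDFD_INFO_CGROUPID)
                printf(" cgroupid=%llu", (unsigned long long)info.cgroupid);
        printf("\n");
        return 0;
}
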
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
- if (arg)
- return -EINVAL;
-
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
+ /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+ return pidfd_info(task, cmd, arg);
+
+ if (arg)
+ return -EINVAL;
+
scoped_guard(task_lock, task) {
nsp = task->nsproxy;
if (nsp)
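
Why the shortcut above matches on _IOC_NR() instead of the full command word: extensible ioctls encode the struct size into the command, so userspace built against an older or newer uapi struct issues a different cmd for the same request. A hypothetical illustration:

#include <linux/ioctl.h>
#include <linux/types.h>

/* two generations of one extensible ioctl: same nr, different size */
struct example_info_v1 { __u64 mask; };
struct example_info_v2 { __u64 mask; __u64 extra; };

#define EXAMPLE_GET_INFO_V1 _IOWR(0xFF, 11, struct example_info_v1)
#define EXAMPLE_GET_INFO_V2 _IOWR(0xFF, 11, struct example_info_v2)

/*
 * EXAMPLE_GET_INFO_V1 != EXAMPLE_GET_INFO_V2, yet _IOC_NR() yields 11
 * for both, so matching on _IOC_NR() routes every size variant to one
 * handler, which then reads the caller's struct size from _IOC_SIZE(cmd).
 */
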
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c66a37522d0..4050942ab52f 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init);
* Allocate a new ACL with the specified number of entries.
*/
struct posix_acl *
-posix_acl_alloc(int count, gfp_t flags)
+posix_acl_alloc(unsigned int count, gfp_t flags)
{
- const size_t size = sizeof(struct posix_acl) +
- count * sizeof(struct posix_acl_entry);
- struct posix_acl *acl = kmalloc(size, flags);
+ struct posix_acl *acl;
+
+ acl = kmalloc(struct_size(acl, a_entries, count), flags);
if (acl)
posix_acl_init(acl, count);
return acl;
@@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
struct posix_acl *clone = NULL;
if (acl) {
- int size = sizeof(struct posix_acl) + acl->a_count *
- sizeof(struct posix_acl_entry);
- clone = kmemdup(acl, size, flags);
+ clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+ flags);
if (clone)
refcount_set(&clone->a_refcount, 1);
}
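
Both conversions above are the standard flexible-array sizing idiom: struct_size(acl, a_entries, count) computes sizeof(*acl) plus count entries with overflow checking, saturating to SIZE_MAX so the allocation fails cleanly instead of returning an undersized buffer. A generic sketch (names hypothetical):

#include <linux/overflow.h>
#include <linux/slab.h>

struct example {
        unsigned int count;
        u64 items[];            /* flexible array member */
};

static struct example *example_alloc(unsigned int count, gfp_t gfp)
{
        /* struct_size() saturates on overflow, so kmalloc() returns NULL */
        struct example *e = kmalloc(struct_size(e, items, count), gfp);

        if (e)
                e->count = count;
        return e;
}
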
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b31283d81c52..3e31a4805427 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -58,7 +58,6 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
@@ -2553,8 +2552,8 @@ static int show_timer(struct seq_file *m, void *v)
seq_printf(m, "ID: %d\n", timer->it_id);
seq_printf(m, "signal: %d/%px\n",
- timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
+ timer->sigq.info.si_signo,
+ timer->sigq.info.si_value.sival_ptr);
seq_printf(m, "notify: %s/%s.%d\n",
nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 5e391cbca7a3..24baf23e864f 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fdget_rcu(task, fd);
- rcu_read_unlock();
+ file = fget_task(task, fd);
if (file) {
*mode = file->f_mode;
fput(file);
@@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- rcu_read_lock();
for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fdget_rcu(p, &fd);
+ f = fget_task_next(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
- rcu_read_unlock();
fput(f);
data.fd = fd;
@@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out;
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
out:
put_task_struct(p);
return 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 87e4d6282025..1695509370b8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -102,7 +102,7 @@ struct proc_inode {
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
- struct ctl_table *sysctl_entry;
+ const struct ctl_table *sysctl_entry;
struct hlist_node sibling_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index cb0edc7cbf09..714a22ded8a8 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -11,13 +11,13 @@
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= nr_irqs) ? pos : NULL;
+ return *pos <= irq_get_nr_irqs() ? pos : NULL;
}
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > nr_irqs)
+ if (*pos > irq_get_nr_irqs())
return NULL;
return pos;
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d11ebc055ce0..27a283d85a6e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -17,6 +17,7 @@
#include <linux/bpf-cgroup.h>
#include <linux/mount.h>
#include <linux/kmemleak.h>
+#include <linux/lockdep.h>
#include "internal.h"
#define list_for_each_table_entry(entry, header) \
@@ -33,7 +34,7 @@ static const struct inode_operations proc_sys_dir_operations;
* Support for permanently empty directories.
* Must be non-empty to avoid sharing an address with other tables.
*/
-static struct ctl_table sysctl_mount_point[] = {
+static const struct ctl_table sysctl_mount_point[] = {
{ }
};
@@ -67,7 +68,7 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll)
wake_up_interruptible(&poll->wait);
}
-static struct ctl_table root_table[] = {
+static const struct ctl_table root_table[] = {
{
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
@@ -88,7 +89,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
static void drop_sysctl_table(struct ctl_table_header *header);
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry);
+ const struct ctl_table **pentry);
static int insert_links(struct ctl_table_header *head);
static void put_links(struct ctl_table_header *header);
@@ -109,14 +110,15 @@ static int namecmp(const char *name1, int len1, const char *name2, int len2)
return cmp;
}
-/* Called under sysctl_lock */
-static struct ctl_table *find_entry(struct ctl_table_header **phead,
+static const struct ctl_table *find_entry(struct ctl_table_header **phead,
struct ctl_dir *dir, const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct rb_node *node = dir->root.rb_node;
+ lockdep_assert_held(&sysctl_lock);
+
while (node)
{
struct ctl_node *ctl_node;
@@ -141,7 +143,7 @@ static struct ctl_table *find_entry(struct ctl_table_header **phead,
return NULL;
}
-static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static int insert_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
struct rb_node **p = &head->parent->root.rb_node;
@@ -151,7 +153,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
while (*p) {
struct ctl_table_header *parent_head;
- struct ctl_table *parent_entry;
+ const struct ctl_table *parent_entry;
struct ctl_node *parent_node;
const char *parent_name;
int cmp;
@@ -180,7 +182,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
return 0;
}
-static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
+static void erase_entry(struct ctl_table_header *head, const struct ctl_table *entry)
{
struct rb_node *node = &head->node[entry - head->ctl_table].node;
@@ -189,7 +191,7 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
static void init_header(struct ctl_table_header *head,
struct ctl_table_root *root, struct ctl_table_set *set,
- struct ctl_node *node, struct ctl_table *table, size_t table_size)
+ struct ctl_node *node, const struct ctl_table *table, size_t table_size)
{
head->ctl_table = table;
head->ctl_table_size = table_size;
@@ -204,7 +206,7 @@ static void init_header(struct ctl_table_header *head,
head->node = node;
INIT_HLIST_HEAD(&head->inodes);
if (node) {
- struct ctl_table *entry;
+ const struct ctl_table *entry;
list_for_each_table_entry(entry, head) {
node->header = head;
@@ -217,7 +219,7 @@ static void init_header(struct ctl_table_header *head,
static void erase_header(struct ctl_table_header *head)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
list_for_each_table_entry(entry, head)
erase_entry(head, entry);
@@ -225,7 +227,7 @@ static void erase_header(struct ctl_table_header *head)
static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_table_header *dir_h = &dir->header;
int err;
@@ -263,18 +265,20 @@ fail_links:
return err;
}
-/* called under sysctl_lock */
static int use_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (unlikely(p->unregistering))
return 0;
p->used++;
return 1;
}
-/* called under sysctl_lock */
static void unuse_table(struct ctl_table_header *p)
{
+ lockdep_assert_held(&sysctl_lock);
+
if (!--p->used)
if (unlikely(p->unregistering))
complete(p->unregistering);
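
The pattern applied throughout these hunks: locking-contract comments become lockdep_assert_held(), which compiles to nothing without CONFIG_LOCKDEP and turns a violated contract into a splat with it enabled. A minimal sketch (helper name hypothetical):

#include <linux/lockdep.h>

static void example_locked_helper(struct ctl_table_header *p)
{
        lockdep_assert_held(&sysctl_lock);      /* was: "called under sysctl_lock" */

        p->used++;
}
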
@@ -285,9 +289,11 @@ static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
}
-/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
+ /* will reacquire if has to wait */
+ lockdep_assert_held(&sysctl_lock);
+
/*
* if p->used is 0, nobody will ever touch that entry again;
* we'll eliminate all paths to it before dropping sysctl_lock
@@ -344,12 +350,12 @@ lookup_header_set(struct ctl_table_root *root)
return set;
}
-static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
- struct ctl_dir *dir,
- const char *name, int namelen)
+static const struct ctl_table *lookup_entry(struct ctl_table_header **phead,
+ struct ctl_dir *dir,
+ const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
spin_lock(&sysctl_lock);
entry = find_entry(&head, dir, name, namelen);
@@ -374,10 +380,10 @@ static struct ctl_node *first_usable_entry(struct rb_node *node)
}
static void first_entry(struct ctl_dir *dir,
- struct ctl_table_header **phead, struct ctl_table **pentry)
+ struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = NULL;
- struct ctl_table *entry = NULL;
+ const struct ctl_table *entry = NULL;
struct ctl_node *ctl_node;
spin_lock(&sysctl_lock);
@@ -391,10 +397,10 @@ static void first_entry(struct ctl_dir *dir,
*pentry = entry;
}
-static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
+static void next_entry(struct ctl_table_header **phead, const struct ctl_table **pentry)
{
struct ctl_table_header *head = *phead;
- struct ctl_table *entry = *pentry;
+ const struct ctl_table *entry = *pentry;
struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
spin_lock(&sysctl_lock);
@@ -427,7 +433,7 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, const struct ctl_table *table, int op)
{
struct ctl_table_root *root = head->root;
int mode;
@@ -441,7 +447,7 @@ static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, i
}
static struct inode *proc_sys_make_inode(struct super_block *sb,
- struct ctl_table_header *head, struct ctl_table *table)
+ struct ctl_table_header *head, const struct ctl_table *table)
{
struct ctl_table_root *root = head->root;
struct inode *inode;
@@ -512,7 +518,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
struct ctl_table_header *head = grab_header(dir);
struct ctl_table_header *h = NULL;
const struct qstr *name = &dentry->d_name;
- struct ctl_table *p;
+ const struct ctl_table *p;
struct inode *inode;
struct dentry *err = ERR_PTR(-ENOENT);
struct ctl_dir *ctl_dir;
@@ -550,7 +556,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
{
struct inode *inode = file_inode(iocb->ki_filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
size_t count = iov_iter_count(iter);
char *kbuf;
ssize_t error;
@@ -624,7 +630,7 @@ static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter)
static int proc_sys_open(struct inode *inode, struct file *filp)
{
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
/* sysctl was unregistered */
if (IS_ERR(head))
@@ -642,7 +648,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
{
struct inode *inode = file_inode(filp);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
__poll_t ret = DEFAULT_POLLMASK;
unsigned long event;
@@ -673,7 +679,7 @@ out:
static bool proc_sys_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
struct dentry *child, *dir = file->f_path.dentry;
struct inode *inode;
@@ -698,11 +704,11 @@ static bool proc_sys_fill_cache(struct file *file,
res = d_splice_alias(inode, child);
d_lookup_done(child);
if (unlikely(res)) {
- if (IS_ERR(res)) {
- dput(child);
- return false;
- }
dput(child);
+
+ if (IS_ERR(res))
+ return false;
+
child = res;
}
}
@@ -717,7 +723,7 @@ static bool proc_sys_fill_cache(struct file *file,
static bool proc_sys_link_fill_cache(struct file *file,
struct dir_context *ctx,
struct ctl_table_header *head,
- struct ctl_table *table)
+ const struct ctl_table *table)
{
bool ret = true;
@@ -735,7 +741,7 @@ out:
return ret;
}
-static int scan(struct ctl_table_header *head, struct ctl_table *table,
+static int scan(struct ctl_table_header *head, const struct ctl_table *table,
unsigned long *pos, struct file *file,
struct dir_context *ctx)
{
@@ -759,7 +765,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
{
struct ctl_table_header *head = grab_header(file_inode(file));
struct ctl_table_header *h = NULL;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
struct ctl_dir *ctl_dir;
unsigned long pos;
@@ -792,7 +798,7 @@ static int proc_sys_permission(struct mnt_idmap *idmap,
* are _NOT_ writeable, capabilities or not.
*/
struct ctl_table_header *head;
- struct ctl_table *table;
+ const struct ctl_table *table;
int error;
/* Executable files are not allowed under /proc/sys/ */
@@ -836,7 +842,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(path->dentry);
struct ctl_table_header *head = grab_header(inode);
- struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ const struct ctl_table *table = PROC_I(inode)->sysctl_entry;
if (IS_ERR(head))
return PTR_ERR(head);
@@ -935,7 +941,7 @@ static struct ctl_dir *find_subdir(struct ctl_dir *dir,
const char *name, int namelen)
{
struct ctl_table_header *head;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
entry = find_entry(&head, dir, name, namelen);
if (!entry)
@@ -1046,12 +1052,12 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
}
static int sysctl_follow_link(struct ctl_table_header **phead,
- struct ctl_table **pentry)
+ const struct ctl_table **pentry)
{
struct ctl_table_header *head;
+ const struct ctl_table *entry;
struct ctl_table_root *root;
struct ctl_table_set *set;
- struct ctl_table *entry;
struct ctl_dir *dir;
int ret;
@@ -1078,7 +1084,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
return ret;
}
-static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
+static int sysctl_err(const char *path, const struct ctl_table *table, char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1094,7 +1100,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
return -EINVAL;
}
-static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+static int sysctl_check_table_array(const char *path, const struct ctl_table *table)
{
unsigned int extra;
int err = 0;
@@ -1133,7 +1139,7 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
static int sysctl_check_table(const char *path, struct ctl_table_header *header)
{
- struct ctl_table *entry;
+ const struct ctl_table *entry;
int err = 0;
list_for_each_table_entry(entry, header) {
if (!entry->procname)
@@ -1169,8 +1175,9 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header)
static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
{
- struct ctl_table *link_table, *entry, *link;
+ struct ctl_table *link_table, *link;
struct ctl_table_header *links;
+ const struct ctl_table *entry;
struct ctl_node *node;
char *link_name;
int name_bytes;
@@ -1215,7 +1222,7 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table_root *link_root)
{
struct ctl_table_header *tmp_head;
- struct ctl_table *entry, *link;
+ const struct ctl_table *entry, *link;
if (header->ctl_table_size == 0 ||
sysctl_is_perm_empty_ctl_header(header))
@@ -1358,7 +1365,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
*/
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
- const char *path, struct ctl_table *table, size_t table_size)
+ const char *path, const struct ctl_table *table, size_t table_size)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
@@ -1419,7 +1426,7 @@ fail:
*
* See __register_sysctl_table for more details.
*/
-struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table,
+struct ctl_table_header *register_sysctl_sz(const char *path, const struct ctl_table *table,
size_t table_size)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
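
With the tables constified, callers can declare their sysctl tables const and register them unchanged. A sketch of a sentinel-free registration against the signature above (table and names hypothetical):

#include <linux/sysctl.h>

static int example_value;

static const struct ctl_table example_table[] = {
        {
                .procname       = "example_value",
                .data           = &example_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
};

static int __init example_sysctl_init(void)
{
        if (!register_sysctl_sz("kernel", example_table,
                                ARRAY_SIZE(example_table)))
                return -ENOMEM;
        return 0;
}
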
@@ -1448,7 +1455,7 @@ EXPORT_SYMBOL(register_sysctl_sz);
*
* Context: if your base directory does not exist it will be created for you.
*/
-void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+void __init __register_sysctl_init(const char *path, const struct ctl_table *table,
const char *table_name, size_t table_size)
{
struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
@@ -1466,7 +1473,7 @@ static void put_links(struct ctl_table_header *header)
struct ctl_table_root *root = header->root;
struct ctl_dir *parent = header->parent;
struct ctl_dir *core_parent;
- struct ctl_table *entry;
+ const struct ctl_table *entry;
if (header->set == root_set)
return;
@@ -1477,7 +1484,7 @@ static void put_links(struct ctl_table_header *header)
list_for_each_table_entry(entry, header) {
struct ctl_table_header *link_head;
- struct ctl_table *link;
+ const struct ctl_table *link;
const char *name = entry->procname;
link = find_entry(&link_head, core_parent, name, strlen(name));
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index da60956b2915..8b444e862319 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -76,7 +76,7 @@ static void show_all_irqs(struct seq_file *p)
seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
next = i + 1;
}
- show_irq_gap(p, nr_irqs - next);
+ show_irq_gap(p, irq_get_nr_irqs() - next);
}
static int show_stat(struct seq_file *p, void *v)
@@ -196,7 +196,7 @@ static int stat_open(struct inode *inode, struct file *file)
unsigned int size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
- size += 2 * nr_irqs;
+ size += 2 * irq_get_nr_irqs();
return single_open_size(file, show_stat, NULL, size);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7eb010de39fe..38a5a3e9cba2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -978,7 +978,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 4c925e55dbcd..818083a36bef 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -9,14 +9,13 @@ config QUOTA
help
If you say Y here, you will be able to set per user limits for disk
usage (also called disk quotas). Currently, it works for the
- ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems.
- Note that gfs2 and xfs use their own quota system.
- Ext3, ext4 and reiserfs also support journaled quotas for which
- you don't need to run quotacheck(8) after an unclean shutdown.
- For further details, read the Quota mini-HOWTO, available from
- <https://www.tldp.org/docs.html#howto>, or the documentation provided
- with the quota tools. Probably the quota support is only useful for
- multi user systems. If unsure, say N.
+ ext2, ext3, ext4, f2fs, jfs and ocfs2 file systems. Note that gfs2
+ and xfs use their own quota system. Ext3 and ext4 also support
+ journaled quotas for which you don't need to run quotacheck(8) after
+ an unclean shutdown. For further details, read the Quota mini-HOWTO,
+ available from <https://www.tldp.org/docs.html#howto>, or the
+ documentation provided with the quota tools. Probably the quota
+ support is only useful for multi user systems. If unsure, say N.
config QUOTA_NETLINK_INTERFACE
bool "Report quota messages through netlink interface"
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b40410cd39af..3dd8d6f27725 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,7 +80,6 @@
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
-#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 290157bc7bec..7c2b75a44485 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -976,21 +976,19 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
struct super_block *sb;
unsigned int cmds = cmd >> SUBCMDSHIFT;
unsigned int type = cmd & SUBCMDMASK;
- struct fd f;
+ CLASS(fd_raw, f)(fd);
int ret;
- f = fdget_raw(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (type >= MAXQUOTAS)
- goto out;
+ return -EINVAL;
if (quotactl_cmd_write(cmds)) {
ret = mnt_want_write(fd_file(f)->f_path.mnt);
if (ret)
- goto out;
+ return ret;
}
sb = fd_file(f)->f_path.mnt->mnt_sb;
@@ -1008,7 +1006,5 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
if (quotactl_cmd_write(cmds))
mnt_drop_write(fd_file(f)->f_path.mnt);
-out:
- fdput(f);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24afdb3a..a6133241dfb8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek);
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
off_t retval;
- struct fd f = fdget_pos(fd);
- if (!fd_file(f))
+ CLASS(fd_pos, f)(fd);
+ if (fd_empty(f))
return -EBADF;
retval = -EINVAL;
@@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput_pos(f);
return retval;
}
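
A minimal sketch of the conversion pattern used across this file (function name hypothetical): CLASS(fd_pos, f)(fd) declares a struct fd whose matching fdput_pos() runs automatically when f goes out of scope, so every error path becomes a plain return:

static loff_t example_tell(unsigned int fd)
{
        CLASS(fd_pos, f)(fd);   /* scoped fdget_pos()/fdput_pos() pair */

        if (fd_empty(f))
                return -EBADF;  /* no explicit fdput_pos() needed */

        return vfs_llseek(fd_file(f), 0, SEEK_CUR);
}
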
@@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned int, whence)
{
int retval;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
loff_t offset;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- retval = -EINVAL;
if (whence > SEEK_MAX)
- goto out_putf;
+ return -EINVAL;
offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
@@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
if (!copy_to_user(result, &offset, sizeof(offset)))
retval = 0;
}
-out_putf:
- fdput_pos(f);
return retval;
}
#endif
@@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file)
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
}
@@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
@@ -751,21 +745,17 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PREAD)
- ret = vfs_read(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ return vfs_read(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
@@ -785,21 +775,17 @@ COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PWRITE)
- ret = vfs_write(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ return vfs_write(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
@@ -1075,10 +1061,10 @@ out:
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1087,7 +1073,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1099,10 +1084,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1111,7 +1096,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1129,18 +1113,16 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PREAD)
ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1152,18 +1134,16 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PWRITE)
ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1315,7 +1295,6 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
- struct fd in, out;
struct inode *in_inode, *out_inode;
struct pipe_inode_info *opipe;
loff_t pos;
@@ -1326,35 +1305,32 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
/*
* Get input file, and verify that it is ok.
*/
- retval = -EBADF;
- in = fdget(in_fd);
- if (!fd_file(in))
- goto out;
+ CLASS(fd, in)(in_fd);
+ if (fd_empty(in))
+ return -EBADF;
if (!(fd_file(in)->f_mode & FMODE_READ))
- goto fput_in;
- retval = -ESPIPE;
+ return -EBADF;
if (!ppos) {
pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
if (!(fd_file(in)->f_mode & FMODE_PREAD))
- goto fput_in;
+ return -ESPIPE;
}
retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
- goto fput_in;
+ return retval;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
/*
* Get output file, and verify that it is ok.
*/
- retval = -EBADF;
- out = fdget(out_fd);
- if (!fd_file(out))
- goto fput_in;
+ CLASS(fd, out)(out_fd);
+ if (fd_empty(out))
+ return -EBADF;
if (!(fd_file(out)->f_mode & FMODE_WRITE))
- goto fput_out;
+ return -EBADF;
in_inode = file_inode(fd_file(in));
out_inode = file_inode(fd_file(out));
out_pos = fd_file(out)->f_pos;
@@ -1363,9 +1339,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
if (unlikely(pos + count > max)) {
- retval = -EOVERFLOW;
if (pos >= max)
- goto fput_out;
+ return -EOVERFLOW;
count = max - pos;
}
@@ -1384,7 +1359,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!opipe) {
retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
- goto fput_out;
+ return retval;
retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
@@ -1410,12 +1385,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
inc_syscw(current);
if (pos > max)
retval = -EOVERFLOW;
-
-fput_out:
- fdput(out);
-fput_in:
- fdput(in);
-out:
return retval;
}
@@ -1671,36 +1640,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
{
loff_t pos_in;
loff_t pos_out;
- struct fd f_in;
- struct fd f_out;
ssize_t ret = -EBADF;
- f_in = fdget(fd_in);
- if (!fd_file(f_in))
- goto out2;
+ CLASS(fd, f_in)(fd_in);
+ if (fd_empty(f_in))
+ return -EBADF;
- f_out = fdget(fd_out);
- if (!fd_file(f_out))
- goto out1;
+ CLASS(fd, f_out)(fd_out);
+ if (fd_empty(f_out))
+ return -EBADF;
- ret = -EFAULT;
if (off_in) {
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_out = fd_file(f_out)->f_pos;
}
- ret = -EINVAL;
if (flags != 0)
- goto out;
+ return -EINVAL;
ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
@@ -1722,12 +1687,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
fd_file(f_out)->f_pos = pos_out;
}
}
-
-out:
- fdput(f_out);
-out1:
- fdput(f_in);
-out2:
return ret;
}
@@ -1830,18 +1789,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
- return false;
+ return -EINVAL;
if (!is_power_of_2(len))
- return false;
+ return -EINVAL;
- if (!IS_ALIGNED(pos, len))
- return false;
+ if (!IS_ALIGNED(iocb->ki_pos, len))
+ return -EINVAL;
- return true;
+ if (!(iocb->ki_flags & IOCB_DIRECT))
+ return -EOPNOTSUPP;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
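
A hypothetical caller sketch showing the new contract: the helper now returns an -errno instead of a bool and additionally rejects buffered I/O, so a filesystem write path can forward the result directly:

static ssize_t example_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        if (iocb->ki_flags & IOCB_ATOMIC) {
                int err = generic_atomic_write_valid(iocb, from);

                if (err)
                        return err;     /* -EINVAL or -EOPNOTSUPP */
        }

        /* ... proceed with the direct I/O submission ... */
        return iov_iter_count(from);
}
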
diff --git a/fs/readdir.c b/fs/readdir.c
index 6d29cab8576e..0038efda417b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -309,7 +308,7 @@ efault:
SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
@@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -392,7 +389,7 @@ efault:
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct linux_dirent64 __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -560,7 +554,7 @@ efault:
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
.current_dir = dirent,
@@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
#endif
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
deleted file mode 100644
index 0e6fe26458fe..000000000000
--- a/fs/reiserfs/Kconfig
+++ /dev/null
@@ -1,91 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config REISERFS_FS
- tristate "Reiserfs support (deprecated)"
- select BUFFER_HEAD
- select CRC32
- select LEGACY_DIRECT_IO
- help
- Reiserfs is deprecated and scheduled to be removed from the kernel
- in 2025. If you are still using it, please migrate to another
- filesystem or tell us your usecase for reiserfs.
-
- Reiserfs stores not just filenames but the files themselves in a
- balanced tree. Uses journalling.
-
- Balanced trees are more efficient than traditional file system
- architectural foundations.
-
- In general, ReiserFS is as fast as ext2, but is very efficient with
- large directories and small files. Additional patches are needed
- for NFS and quotas, please see
- <https://reiser4.wiki.kernel.org/index.php/Main_Page> for links.
-
- It is more easily extended to have features currently found in
- database and keyword search systems than block allocation based file
- systems are. The next version will be so extended, and will support
- plugins consistent with our motto ``It takes more than a license to
- make source code open.''
-
- Read <https://reiser4.wiki.kernel.org/index.php/Main_Page>
- to learn more about reiserfs.
-
- Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-
- If you like it, you can pay us to add new features to it that you
- need, buy a support contract, or pay us to port it to another OS.
-
-config REISERFS_CHECK
- bool "Enable reiserfs debug mode"
- depends on REISERFS_FS
- help
- If you set this to Y, then ReiserFS will perform every check it can
- possibly imagine of its internal consistency throughout its
- operation. It will also go substantially slower. More than once we
- have forgotten that this was on, and then gone despondent over the
- latest benchmarks.:-) Use of this option allows our team to go all
- out in checking for consistency when debugging without fear of its
- effect on end users. If you are on the verge of sending in a bug
- report, say Y and you might get a useful error message. Almost
- everyone should say N.
-
-config REISERFS_PROC_INFO
- bool "Stats in /proc/fs/reiserfs"
- depends on REISERFS_FS && PROC_FS
- help
- Create under /proc/fs/reiserfs a hierarchy of files, displaying
- various ReiserFS statistics and internal data at the expense of
- making your kernel or module slightly larger (+8 KB). This also
- increases the amount of kernel memory required for each mount.
- Almost everyone but ReiserFS developers and people fine-tuning
- reiserfs or tracing problems should say N.
-
-config REISERFS_FS_XATTR
- bool "ReiserFS extended attributes"
- depends on REISERFS_FS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page for details).
-
- If unsure, say N.
-
-config REISERFS_FS_POSIX_ACL
- bool "ReiserFS POSIX Access Control Lists"
- depends on REISERFS_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- If you don't know what Access Control Lists are, say N
-
-config REISERFS_FS_SECURITY
- bool "ReiserFS Security Labels"
- depends on REISERFS_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ReiserFS filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
deleted file mode 100644
index bd29c58ccbd8..000000000000
--- a/fs/reiserfs/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux reiser-filesystem routines.
-#
-
-obj-$(CONFIG_REISERFS_FS) += reiserfs.o
-
-reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
- super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
- hashes.o tail_conversion.o journal.o resize.o \
- item_ops.o ioctl.o xattr.o lock.o
-
-ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
-reiserfs-objs += procfs.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
-reiserfs-objs += xattr_user.o xattr_trusted.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_SECURITY),y)
-reiserfs-objs += xattr_security.o
-endif
-
-ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y)
-reiserfs-objs += xattr_acl.o
-endif
-
-TAGS:
- etags *.c
diff --git a/fs/reiserfs/README b/fs/reiserfs/README
deleted file mode 100644
index 11e9ecf24b63..000000000000
--- a/fs/reiserfs/README
+++ /dev/null
@@ -1,151 +0,0 @@
-[LICENSING]
-
-ReiserFS is hereby licensed under the GNU General
-Public License version 2.
-
-Source code files that contain the phrase "licensing governed by
-reiserfs/README" are "governed files" throughout this file. Governed
-files are licensed under the GPL. The portions of them owned by Hans
-Reiser, or authorized to be licensed by him, have been in the past,
-and likely will be in the future, licensed to other parties under
-other licenses. If you add your code to governed files, and don't
-want it to be owned by Hans Reiser, put your copyright label on that
-code so the poor blight and his customers can keep things straight.
-All portions of governed files not labeled otherwise are owned by Hans
-Reiser, and by adding your code to it, widely distributing it to
-others or sending us a patch, and leaving the sentence in stating that
-licensing is governed by the statement in this file, you accept this.
-It will be a kindness if you identify whether Hans Reiser is allowed
-to license code labeled as owned by you on your behalf other than
-under the GPL, because he wants to know if it is okay to do so and put
-a check in the mail to you (for non-trivial improvements) when he
-makes his next sale. He makes no guarantees as to the amount if any,
-though he feels motivated to motivate contributors, and you can surely
-discuss this with him before or after contributing. You have the
-right to decline to allow him to license your code contribution other
-than under the GPL.
-
-Further licensing options are available for commercial and/or other
-interests directly from Hans Reiser: hans@reiser.to. If you interpret
-the GPL as not allowing those additional licensing options, you read
-it wrongly, and Richard Stallman agrees with me, when carefully read
-you can see that those restrictions on additional terms do not apply
-to the owner of the copyright, and my interpretation of this shall
-govern for this license.
-
-Finally, nothing in this license shall be interpreted to allow you to
-fail to fairly credit me, or to remove my credits, without my
-permission, unless you are an end user not redistributing to others.
-If you have doubts about how to properly do that, or about what is
-fair, ask. (Last I spoke with him Richard was contemplating how best
-to address the fair crediting issue in the next GPL version.)
-
-[END LICENSING]
-
-Reiserfs is a file system based on balanced tree algorithms, which is
-described at https://reiser4.wiki.kernel.org/index.php/Main_Page
-
-Stop reading here. Go there, then return.
-
-Send bug reports to yura@namesys.botik.ru.
-
-mkreiserfs and other utilities are in reiserfs/utils, or wherever your
-Linux provider put them. There is some disagreement about how useful
-it is for users to get their fsck and mkreiserfs out of sync with the
-version of reiserfs that is in their kernel, with many important
-distributors wanting them out of sync. :-) Please try to remember to
-recompile and reinstall fsck and mkreiserfs with every update of
-reiserfs, this is a common source of confusion. Note that some of the
-utilities cannot be compiled without accessing the balancing code
-which is in the kernel code, and relocating the utilities may require
-you to specify where that code can be found.
-
-Yes, if you update your reiserfs kernel module you do have to
-recompile your kernel, most of the time. The errors you get will be
-quite cryptic if you forget to do so.
-
-Real users, as opposed to folks who want to hack and then understand
-what went wrong, will want REISERFS_CHECK off.
-
-Hideous Commercial Pitch: Spread your development costs across other OS
-vendors. Select from the best in the world, not the best in your
-building, by buying from third party OS component suppliers. Leverage
-the software component development power of the internet. Be the most
-aggressive in taking advantage of the commercial possibilities of
-decentralized internet development, and add value through your branded
-integration that you sell as an operating system. Let your competitors
-be the ones to compete against the entire internet by themselves. Be
-hip, get with the new economic trend, before your competitors do. Send
-email to hans@reiser.to.
-
-To understand the code, after reading the website, start reading the
-code by reading reiserfs_fs.h first.
-
-Hans Reiser was the project initiator, primary architect, source of all
-funding for the first 5.5 years, and one of the programmers. He owns
-the copyright.
-
-Vladimir Saveljev was one of the programmers, and he worked long hours
-writing the cleanest code. He always made the effort to be the best he
-could be, and to make his code the best that it could be. What resulted
-was quite remarkable. I don't think that money can ever motivate someone
-to work the way he did, he is one of the most selfless men I know.
-
-Yura helps with benchmarking, coding hashes, and block pre-allocation
-code.
-
-Anatoly Pinchuk is a former member of our team who worked closely with
-Vladimir throughout the project's development. He wrote a quite
-substantial portion of the total code. He realized that there was a
-space problem with packing tails of files for files larger than a node
-that start on a node aligned boundary (there are reasons to want to node
-align files), and he invented and implemented indirect items and
-unformatted nodes as the solution.
-
-Konstantin Shvachko was taking part in the early days.
-
-Mikhail Gilula was a brilliant innovator who has shown much generosity.
-
-Grigory Zaigralin was an extremely effective system administrator for
-our group.
-
-Igor Krasheninnikov was wonderful at hardware procurement, repair, and
-network installation.
-
-Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a
-textbook he got the algorithm from in the code. Note that his analysis
-of how we could use the hashing code in making 32 bit NFS cookies work
-was probably more important than the actual algorithm. Colin Plumb also
-contributed to it.
-
-Chris Mason dived right into our code, and in just a few months produced
-the journaling code that dramatically increased the value of ReiserFS.
-He is just an amazing programmer.
-
-Igor Zagorovsky is writing much of the new item handler and extent code
-for our next major release.
-
-Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
-resizer, and is hard at work on implementing allocate on flush. SGI
-implemented allocate on flush before us for XFS, and generously took
-the time to convince me we should do it also. They are great people,
-and a great company.
-
-Yuri Shevchuk and Nikita Danilov are doing squid cache optimization.
-
-Vitaly Fertman is doing fsck.
-
-Jeff Mahoney, of SuSE, contributed a few cleanup fixes, most notably
-the endian safe patches which allow ReiserFS to run on any platform
-supported by the Linux kernel.
-
-SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the
-Alpha PC Company made it possible for me to not have a day job
-anymore, and to dramatically increase our staffing. Ecila funded
-hypertext feature development, MP3.com funded journaling, SuSE funded
-core development, IntegratedLinux.com funded squid web cache
-appliances, bigstorage.com funded HSM, and the alpha PC company funded
-the alpha port. Many of these tasks were helped by sponsors other
-than the ones just named. SuSE has helped in much more than just
-funding....
-
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
deleted file mode 100644
index 2571b1a8be84..000000000000
--- a/fs/reiserfs/acl.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/init.h>
-#include <linux/posix_acl.h>
-
-#define REISERFS_ACL_VERSION 0x0001
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
- __le32 e_id;
-} reiserfs_acl_entry;
-
-typedef struct {
- __le16 e_tag;
- __le16 e_perm;
-} reiserfs_acl_entry_short;
-
-typedef struct {
- __le32 a_version;
-} reiserfs_acl_header;
-
-static inline size_t reiserfs_acl_size(int count)
-{
- if (count <= 4) {
- return sizeof(reiserfs_acl_header) +
- count * sizeof(reiserfs_acl_entry_short);
- } else {
- return sizeof(reiserfs_acl_header) +
- 4 * sizeof(reiserfs_acl_entry_short) +
- (count - 4) * sizeof(reiserfs_acl_entry);
- }
-}
-
-static inline int reiserfs_acl_count(size_t size)
-{
- ssize_t s;
- size -= sizeof(reiserfs_acl_header);
- s = size - 4 * sizeof(reiserfs_acl_entry_short);
- if (s < 0) {
- if (size % sizeof(reiserfs_acl_entry_short))
- return -1;
- return size / sizeof(reiserfs_acl_entry_short);
- } else {
- if (s % sizeof(reiserfs_acl_entry))
- return -1;
- return s / sizeof(reiserfs_acl_entry) + 4;
- }
-}
-
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
- struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct dentry *dentry);
-int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
- struct inode *dir, struct dentry *dentry,
- struct inode *inode);
-int reiserfs_cache_default_acl(struct inode *dir);
-
-#else
-
-#define reiserfs_cache_default_acl(inode) 0
-#define reiserfs_get_acl NULL
-#define reiserfs_set_acl NULL
-
-static inline int reiserfs_acl_chmod(struct dentry *dentry)
-{
- return 0;
-}
-
-static inline int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
- const struct inode *dir, struct dentry *dentry,
- struct inode *inode)
-{
- return 0;
-}
-#endif
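
For reference, reiserfs_acl_size() and reiserfs_acl_count() above are inverses of each other: the first four entries (owner, group, mask, other) carry no qualifier ID and use the 4-byte short form, while every further entry needs the full 8-byte form. The following is a minimal user-space sketch of the size side, with the kernel's __le16/__le32 replaced by plain fixed-width types purely for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Stand-ins for the kernel structs above, using host-endian
 * fixed-width types instead of __le16/__le32. */
typedef struct { uint16_t e_tag; uint16_t e_perm; uint32_t e_id; } acl_entry;
typedef struct { uint16_t e_tag; uint16_t e_perm; } acl_entry_short;
typedef struct { uint32_t a_version; } acl_header;

static size_t acl_size(int count)
{
	if (count <= 4)
		return sizeof(acl_header) + count * sizeof(acl_entry_short);
	return sizeof(acl_header) + 4 * sizeof(acl_entry_short) +
	       (count - 4) * sizeof(acl_entry);
}

int main(void)
{
	for (int n = 1; n <= 6; n++)
		printf("%d entries -> %zu bytes\n", n, acl_size(n));
	return 0;
}

The jump from 20 bytes at four entries to 28 at five marks where the long form kicks in, which is exactly the boundary reiserfs_acl_count() probes from the other direction.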
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
deleted file mode 100644
index bf708ac287b4..000000000000
--- a/fs/reiserfs/bitmap.c
+++ /dev/null
@@ -1,1476 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-/* Reiserfs block (de)allocator, bitmap-based. */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/vmalloc.h>
-#include <linux/quotaops.h>
-#include <linux/seq_file.h>
-
-#define PREALLOCATION_SIZE 9
-
-/* different reiserfs block allocator options */
-
-#define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits)
-
-#define _ALLOC_concentrating_formatted_nodes 0
-#define _ALLOC_displacing_large_files 1
-#define _ALLOC_displacing_new_packing_localities 2
-#define _ALLOC_old_hashed_relocation 3
-#define _ALLOC_new_hashed_relocation 4
-#define _ALLOC_skip_busy 5
-#define _ALLOC_displace_based_on_dirid 6
-#define _ALLOC_hashed_formatted_nodes 7
-#define _ALLOC_old_way 8
-#define _ALLOC_hundredth_slices 9
-#define _ALLOC_dirid_groups 10
-#define _ALLOC_oid_groups 11
-#define _ALLOC_packing_groups 12
-
-#define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s))
-#define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s))
-#define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s))
-
-#define SET_OPTION(optname) \
- do { \
- reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \
- set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \
- } while(0)
-#define TEST_OPTION(optname, s) \
- test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
-
-static inline void get_bit_address(struct super_block *s,
- b_blocknr_t block,
- unsigned int *bmap_nr,
- unsigned int *offset)
-{
- /*
-	 * The block lives in the bitmap block whose number equals
-	 * the block number divided by the number of bits in a block.
- */
- *bmap_nr = block >> (s->s_blocksize_bits + 3);
- /* Within that bitmap block it is located at bit offset *offset. */
- *offset = block & ((s->s_blocksize << 3) - 1);
-}
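
A note on the arithmetic: one bitmap block covers s_blocksize * 8 file system blocks, so shifting by (s_blocksize_bits + 3) is division by that count and the mask extracts the remainder. A standalone sketch, assuming 4 KiB blocks for concreteness:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;	/* assumed 4 KiB blocks */
	unsigned long bits_per_bmap = 1UL << (blocksize_bits + 3);
	unsigned long block = 100000;

	unsigned int bmap_nr = block >> (blocksize_bits + 3);
	unsigned int offset = block & (bits_per_bmap - 1);

	/* 100000 / 32768 = 3 remainder 1696 */
	printf("block %lu -> bitmap %u, bit %u\n", block, bmap_nr, offset);
	return 0;
}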
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
-{
- unsigned int bmap, offset;
- unsigned int bmap_count = reiserfs_bmap_count(s);
-
- if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
- reiserfs_error(s, "vs-4010",
- "block number is out of range %lu (%u)",
- block, SB_BLOCK_COUNT(s));
- return 0;
- }
-
- get_bit_address(s, block, &bmap, &offset);
-
- /*
- * Old format filesystem? Unlikely, but the bitmaps are all
- * up front so we need to account for it.
- */
- if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &REISERFS_SB(s)->s_properties))) {
- b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
- if (block >= bmap1 &&
- block <= bmap1 + bmap_count) {
- reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) "
- "can't be freed or reused",
- block, bmap_count);
- return 0;
- }
- } else {
- if (offset == 0) {
- reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) "
- "can't be freed or reused",
- block, bmap_count);
- return 0;
- }
- }
-
- if (bmap >= bmap_count) {
- reiserfs_error(s, "vs-4030", "bitmap for requested block "
- "is out of range: block=%lu, bitmap_nr=%u",
- block, bmap);
- return 0;
- }
-
- if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
- reiserfs_error(s, "vs-4050", "this is root block (%u), "
- "it must be busy", SB_ROOT_BLOCK(s));
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Searches in journal structures for a given block number (bmap, off).
- * If the block is found in the reiserfs journal, it suggests the next
- * free block candidate to test.
- */
-static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
- int off, int *next)
-{
- b_blocknr_t tmp;
-
- if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
- if (tmp) { /* hint supplied */
- *next = tmp;
- PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
- } else {
- (*next) = off + 1; /* inc offset to avoid looping. */
- PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
- }
- PROC_INFO_INC(s, scan_bitmap.retry);
- return 1;
- }
- return 0;
-}
-
-/*
- * Searches for a window of zero bits with given minimum and maximum
- * lengths in one bitmap block
- */
-static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
- unsigned int bmap_n, int *beg, int boundary,
- int min, int max, int unfm)
-{
- struct super_block *s = th->t_super;
- struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
- struct buffer_head *bh;
- int end, next;
- int org = *beg;
-
- BUG_ON(!th->t_trans_id);
- RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
- "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
- PROC_INFO_INC(s, scan_bitmap.bmap);
-
- if (!bi) {
- reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
- "for bitmap %d", bmap_n);
- return 0;
- }
-
- bh = reiserfs_read_bitmap_block(s, bmap_n);
- if (bh == NULL)
- return 0;
-
- while (1) {
-cont:
- if (bi->free_count < min) {
- brelse(bh);
- return 0; /* No free blocks in this bitmap */
- }
-
- /* search for a first zero bit -- beginning of a window */
- *beg = reiserfs_find_next_zero_le_bit
- ((unsigned long *)(bh->b_data), boundary, *beg);
-
- /*
- * search for a zero bit fails or the rest of bitmap block
- * cannot contain a zero window of minimum size
- */
- if (*beg + min > boundary) {
- brelse(bh);
- return 0;
- }
-
- if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
- continue;
- /* first zero bit found; we check next bits */
- for (end = *beg + 1;; end++) {
- if (end >= *beg + max || end >= boundary
- || reiserfs_test_le_bit(end, bh->b_data)) {
- next = end;
- break;
- }
-
- /*
- * finding the other end of zero bit window requires
- * looking into journal structures (in case of
- * searching for free blocks for unformatted nodes)
- */
- if (unfm && is_block_in_journal(s, bmap_n, end, &next))
- break;
- }
-
- /*
- * now (*beg) points to beginning of zero bits window,
- * (end) points to one bit after the window end
- */
-
- /* found window of proper size */
- if (end - *beg >= min) {
- int i;
- reiserfs_prepare_for_journal(s, bh, 1);
- /*
-			 * try to mark all the blocks as used, checking
-			 * whether they are still free
- */
- for (i = *beg; i < end; i++) {
- /* Don't check in journal again. */
- if (reiserfs_test_and_set_le_bit
- (i, bh->b_data)) {
- /*
- * bit was set by another process while
- * we slept in prepare_for_journal()
- */
- PROC_INFO_INC(s, scan_bitmap.stolen);
-
- /*
-				 * we can continue with a smaller set of
-				 * allocated blocks, if the length of this
-				 * set is greater than or equal to `min'
- */
- if (i >= *beg + min) {
- end = i;
- break;
- }
-
- /*
-				 * otherwise we clear all the bits that
-				 * were set ...
- */
- while (--i >= *beg)
- reiserfs_clear_le_bit
- (i, bh->b_data);
- reiserfs_restore_prepared_buffer(s, bh);
- *beg = org;
-
- /*
- * Search again in current block
- * from beginning
- */
- goto cont;
- }
- }
- bi->free_count -= (end - *beg);
- journal_mark_dirty(th, bh);
- brelse(bh);
-
- /* free block count calculation */
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
- 1);
- PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
- journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
- return end - (*beg);
- } else {
- *beg = next;
- }
- }
-}
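
Stripped of the journal lookups, the transaction machinery, and the retry path for bits stolen by concurrent allocators, the heart of scan_bitmap_block() is a bounded search for a run of zero bits. A user-space sketch of just that search, under those simplifying assumptions:

#include <stdio.h>

/* Find a run of at least `min` and at most `max` zero bits below
 * `boundary`, starting the scan at *beg. On success, store the run's
 * start in *beg and return its length; return 0 if none exists. */
static int find_zero_window(const unsigned char *map, int boundary,
			    int *beg, int min, int max)
{
	for (int i = *beg; i + min <= boundary; i++) {
		int end = i;

		while (end < boundary && end - i < max &&
		       !(map[end / 8] & (1 << (end % 8))))
			end++;
		if (end - i >= min) {
			*beg = i;
			return end - i;
		}
	}
	return 0;
}

int main(void)
{
	unsigned char map[2] = { 0x0f, 0xf0 };	/* bits 4..11 are free */
	int beg = 0;
	int len = find_zero_window(map, 16, &beg, 3, 8);

	printf("window of %d bits at bit %d\n", len, beg);	/* 8 at 4 */
	return 0;
}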
-
-static int bmap_hash_id(struct super_block *s, u32 id)
-{
- char *hash_in = NULL;
- unsigned long hash;
- unsigned bm;
-
- if (id <= 2) {
- bm = 1;
- } else {
- hash_in = (char *)(&id);
- hash = keyed_hash(hash_in, 4);
- bm = hash % reiserfs_bmap_count(s);
- if (!bm)
- bm = 1;
- }
- /* this can only be true when SB_BMAP_NR = 1 */
- if (bm >= reiserfs_bmap_count(s))
- bm = 0;
- return bm;
-}
-
-/*
- * hashes the id and then returns > 0 if the block group for the
- * corresponding hash is full
- */
-static inline int block_group_used(struct super_block *s, u32 id)
-{
- int bm = bmap_hash_id(s, id);
- struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
-
- /*
- * If we don't have cached information on this bitmap block, we're
- * going to have to load it later anyway. Loading it here allows us
- * to make a better decision. This favors long-term performance gain
- * with a better on-disk layout vs. a short term gain of skipping the
- * read and potentially having a bad placement.
- */
- if (info->free_count == UINT_MAX) {
- struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
- brelse(bh);
- }
-
- if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
- return 0;
- }
- return 1;
-}
-
-/*
- * the packing is returned in disk byte order
- */
-__le32 reiserfs_choose_packing(struct inode * dir)
-{
- __le32 packing;
- if (TEST_OPTION(packing_groups, dir->i_sb)) {
- u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
- /*
- * some versions of reiserfsck expect packing locality 1 to be
- * special
- */
- if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
- packing = INODE_PKEY(dir)->k_objectid;
- else
- packing = INODE_PKEY(dir)->k_dir_id;
- } else
- packing = INODE_PKEY(dir)->k_objectid;
- return packing;
-}
-
-/*
- * Tries to find contiguous zero bit window (given size) in given region of
- * bitmap and place new blocks there. Returns number of allocated blocks.
- */
-static int scan_bitmap(struct reiserfs_transaction_handle *th,
- b_blocknr_t * start, b_blocknr_t finish,
- int min, int max, int unfm, sector_t file_block)
-{
- int nr_allocated = 0;
- struct super_block *s = th->t_super;
- unsigned int bm, off;
- unsigned int end_bm, end_off;
- unsigned int off_max = s->s_blocksize << 3;
-
- BUG_ON(!th->t_trans_id);
- PROC_INFO_INC(s, scan_bitmap.call);
-
- /* No point in looking for more free blocks */
- if (SB_FREE_BLOCKS(s) <= 0)
- return 0;
-
- get_bit_address(s, *start, &bm, &off);
- get_bit_address(s, finish, &end_bm, &end_off);
- if (bm > reiserfs_bmap_count(s))
- return 0;
- if (end_bm > reiserfs_bmap_count(s))
- end_bm = reiserfs_bmap_count(s);
-
- /*
- * When the bitmap is more than 10% free, anyone can allocate.
- * When it's less than 10% free, only files that already use the
- * bitmap are allowed. Once we pass 80% full, this restriction
- * is lifted.
- *
- * We do this so that files that grow later still have space close to
- * their original allocation. This improves locality, and presumably
- * performance as a result.
- *
- * This is only an allocation policy and does not make up for getting a
- * bad hint. Decent hinting must be implemented for this to work well.
- */
- if (TEST_OPTION(skip_busy, s)
- && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
- for (; bm < end_bm; bm++, off = 0) {
- if ((off && (!unfm || (file_block != 0)))
- || SB_AP_BITMAP(s)[bm].free_count >
- (s->s_blocksize << 3) / 10)
- nr_allocated =
- scan_bitmap_block(th, bm, &off, off_max,
- min, max, unfm);
- if (nr_allocated)
- goto ret;
- }
- /* we know from above that start is a reasonable number */
- get_bit_address(s, *start, &bm, &off);
- }
-
- for (; bm < end_bm; bm++, off = 0) {
- nr_allocated =
- scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
- if (nr_allocated)
- goto ret;
- }
-
- nr_allocated =
- scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
-
-ret:
- *start = bm * off_max + off;
- return nr_allocated;
-
-}
-
-static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
- struct inode *inode, b_blocknr_t block,
- int for_unformatted)
-{
- struct super_block *s = th->t_super;
- struct reiserfs_super_block *rs;
- struct buffer_head *sbh, *bmbh;
- struct reiserfs_bitmap_info *apbi;
- unsigned int nr, offset;
-
- BUG_ON(!th->t_trans_id);
- PROC_INFO_INC(s, free_block);
- rs = SB_DISK_SUPER_BLOCK(s);
- sbh = SB_BUFFER_WITH_SB(s);
- apbi = SB_AP_BITMAP(s);
-
- get_bit_address(s, block, &nr, &offset);
-
- if (nr >= reiserfs_bmap_count(s)) {
- reiserfs_error(s, "vs-4075", "block %lu is out of range",
- block);
- return;
- }
-
- bmbh = reiserfs_read_bitmap_block(s, nr);
- if (!bmbh)
- return;
-
- reiserfs_prepare_for_journal(s, bmbh, 1);
-
- /* clear bit for the given block in bit map */
- if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
- reiserfs_error(s, "vs-4080",
- "block %lu: bit already cleared", block);
- }
- apbi[nr].free_count++;
- journal_mark_dirty(th, bmbh);
- brelse(bmbh);
-
- reiserfs_prepare_for_journal(s, sbh, 1);
- /* update super block */
- set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
-
- journal_mark_dirty(th, sbh);
- if (for_unformatted) {
- int depth = reiserfs_write_unlock_nested(s);
- dquot_free_block_nodirty(inode, 1);
- reiserfs_write_lock_nested(s, depth);
- }
-}
-
-void reiserfs_free_block(struct reiserfs_transaction_handle *th,
- struct inode *inode, b_blocknr_t block,
- int for_unformatted)
-{
- struct super_block *s = th->t_super;
-
- BUG_ON(!th->t_trans_id);
- RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
- if (!is_reusable(s, block, 1))
- return;
-
- if (block > sb_block_count(REISERFS_SB(s)->s_rs)) {
- reiserfs_error(th->t_super, "bitmap-4072",
- "Trying to free block outside file system "
- "boundaries (%lu > %lu)",
- block, sb_block_count(REISERFS_SB(s)->s_rs));
- return;
- }
- /* mark it before we clear it, just in case */
- journal_mark_freed(th, s, block);
- _reiserfs_free_block(th, inode, block, for_unformatted);
-}
-
-/* preallocated blocks don't need to be run through journal_mark_freed */
-static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
- struct inode *inode, b_blocknr_t block)
-{
- BUG_ON(!th->t_trans_id);
- RFALSE(!th->t_super,
- "vs-4060: trying to free block on nonexistent device");
- if (!is_reusable(th->t_super, block, 1))
- return;
- _reiserfs_free_block(th, inode, block, 1);
-}
-
-static void __discard_prealloc(struct reiserfs_transaction_handle *th,
- struct reiserfs_inode_info *ei)
-{
- unsigned long save = ei->i_prealloc_block;
- int dirty = 0;
- struct inode *inode = &ei->vfs_inode;
-
- BUG_ON(!th->t_trans_id);
-#ifdef CONFIG_REISERFS_CHECK
- if (ei->i_prealloc_count < 0)
- reiserfs_error(th->t_super, "zam-4001",
- "inode has negative prealloc blocks count.");
-#endif
- while (ei->i_prealloc_count > 0) {
- b_blocknr_t block_to_free;
-
- /*
- * reiserfs_free_prealloc_block can drop the write lock,
- * which could allow another caller to free the same block.
- * We can protect against it by modifying the prealloc
- * state before calling it.
- */
- block_to_free = ei->i_prealloc_block++;
- ei->i_prealloc_count--;
- reiserfs_free_prealloc_block(th, inode, block_to_free);
- dirty = 1;
- }
- if (dirty)
- reiserfs_update_sd(th, inode);
- ei->i_prealloc_block = save;
- list_del_init(&ei->i_prealloc_list);
-}
-
-/* FIXME: It should be inline function */
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
- struct inode *inode)
-{
- struct reiserfs_inode_info *ei = REISERFS_I(inode);
-
- BUG_ON(!th->t_trans_id);
- if (ei->i_prealloc_count)
- __discard_prealloc(th, ei);
-}
-
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
-{
- struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
-
- BUG_ON(!th->t_trans_id);
- while (!list_empty(plist)) {
- struct reiserfs_inode_info *ei;
- ei = list_entry(plist->next, struct reiserfs_inode_info,
- i_prealloc_list);
-#ifdef CONFIG_REISERFS_CHECK
- if (!ei->i_prealloc_count) {
- reiserfs_error(th->t_super, "zam-4001",
- "inode is in prealloc list but has "
- "no preallocated blocks.");
- }
-#endif
- __discard_prealloc(th, ei);
- }
-}
-
-void reiserfs_init_alloc_options(struct super_block *s)
-{
- set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
- set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
- set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
-}
-
-/* block allocator related options are parsed here */
-int reiserfs_parse_alloc_options(struct super_block *s, char *options)
-{
- char *this_char, *value;
-
- /* clear default settings */
- REISERFS_SB(s)->s_alloc_options.bits = 0;
-
- while ((this_char = strsep(&options, ":")) != NULL) {
- if ((value = strchr(this_char, '=')) != NULL)
- *value++ = 0;
-
- if (!strcmp(this_char, "concentrating_formatted_nodes")) {
- int temp;
- SET_OPTION(concentrating_formatted_nodes);
- temp = (value
- && *value) ? simple_strtoul(value, &value,
- 0) : 10;
- if (temp <= 0 || temp > 100) {
- REISERFS_SB(s)->s_alloc_options.border = 10;
- } else {
- REISERFS_SB(s)->s_alloc_options.border =
- 100 / temp;
- }
- continue;
- }
- if (!strcmp(this_char, "displacing_large_files")) {
- SET_OPTION(displacing_large_files);
- REISERFS_SB(s)->s_alloc_options.large_file_size =
- (value
- && *value) ? simple_strtoul(value, &value, 0) : 16;
- continue;
- }
- if (!strcmp(this_char, "displacing_new_packing_localities")) {
- SET_OPTION(displacing_new_packing_localities);
- continue;
- }
-
- if (!strcmp(this_char, "old_hashed_relocation")) {
- SET_OPTION(old_hashed_relocation);
- continue;
- }
-
- if (!strcmp(this_char, "new_hashed_relocation")) {
- SET_OPTION(new_hashed_relocation);
- continue;
- }
-
- if (!strcmp(this_char, "dirid_groups")) {
- SET_OPTION(dirid_groups);
- continue;
- }
- if (!strcmp(this_char, "oid_groups")) {
- SET_OPTION(oid_groups);
- continue;
- }
- if (!strcmp(this_char, "packing_groups")) {
- SET_OPTION(packing_groups);
- continue;
- }
- if (!strcmp(this_char, "hashed_formatted_nodes")) {
- SET_OPTION(hashed_formatted_nodes);
- continue;
- }
-
- if (!strcmp(this_char, "skip_busy")) {
- SET_OPTION(skip_busy);
- continue;
- }
-
- if (!strcmp(this_char, "hundredth_slices")) {
- SET_OPTION(hundredth_slices);
- continue;
- }
-
- if (!strcmp(this_char, "old_way")) {
- SET_OPTION(old_way);
- continue;
- }
-
- if (!strcmp(this_char, "displace_based_on_dirid")) {
- SET_OPTION(displace_based_on_dirid);
- continue;
- }
-
- if (!strcmp(this_char, "preallocmin")) {
- REISERFS_SB(s)->s_alloc_options.preallocmin =
- (value
- && *value) ? simple_strtoul(value, &value, 0) : 4;
- continue;
- }
-
- if (!strcmp(this_char, "preallocsize")) {
- REISERFS_SB(s)->s_alloc_options.preallocsize =
- (value
- && *value) ? simple_strtoul(value, &value,
- 0) :
- PREALLOCATION_SIZE;
- continue;
- }
-
- reiserfs_warning(s, "zam-4001", "unknown option - %s",
- this_char);
- return 1;
- }
-
- reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
- return 0;
-}
-
-static void print_sep(struct seq_file *seq, int *first)
-{
- if (!*first)
- seq_puts(seq, ":");
- else
- *first = 0;
-}
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s)
-{
- int first = 1;
-
- if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) |
- (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups)))
- return;
-
- seq_puts(seq, ",alloc=");
-
- if (TEST_OPTION(concentrating_formatted_nodes, s)) {
- print_sep(seq, &first);
- if (REISERFS_SB(s)->s_alloc_options.border != 10) {
- seq_printf(seq, "concentrating_formatted_nodes=%d",
- 100 / REISERFS_SB(s)->s_alloc_options.border);
- } else
- seq_puts(seq, "concentrating_formatted_nodes");
- }
- if (TEST_OPTION(displacing_large_files, s)) {
- print_sep(seq, &first);
- if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) {
- seq_printf(seq, "displacing_large_files=%lu",
- REISERFS_SB(s)->s_alloc_options.large_file_size);
- } else
- seq_puts(seq, "displacing_large_files");
- }
- if (TEST_OPTION(displacing_new_packing_localities, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "displacing_new_packing_localities");
- }
- if (TEST_OPTION(old_hashed_relocation, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "old_hashed_relocation");
- }
- if (TEST_OPTION(new_hashed_relocation, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "new_hashed_relocation");
- }
- if (TEST_OPTION(dirid_groups, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "dirid_groups");
- }
- if (TEST_OPTION(oid_groups, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "oid_groups");
- }
- if (TEST_OPTION(packing_groups, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "packing_groups");
- }
- if (TEST_OPTION(hashed_formatted_nodes, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "hashed_formatted_nodes");
- }
- if (TEST_OPTION(skip_busy, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "skip_busy");
- }
- if (TEST_OPTION(hundredth_slices, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "hundredth_slices");
- }
- if (TEST_OPTION(old_way, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "old_way");
- }
- if (TEST_OPTION(displace_based_on_dirid, s)) {
- print_sep(seq, &first);
- seq_puts(seq, "displace_based_on_dirid");
- }
- if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) {
- print_sep(seq, &first);
- seq_printf(seq, "preallocmin=%d",
- REISERFS_SB(s)->s_alloc_options.preallocmin);
- }
- if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) {
- print_sep(seq, &first);
- seq_printf(seq, "preallocsize=%d",
- REISERFS_SB(s)->s_alloc_options.preallocsize);
- }
-}
-
-static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
- char *hash_in;
-
- if (hint->formatted_node) {
- hash_in = (char *)&hint->key.k_dir_id;
- } else {
- if (!hint->inode) {
- /*hint->search_start = hint->beg;*/
- hash_in = (char *)&hint->key.k_dir_id;
- } else
- if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
- hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
- else
- hash_in =
- (char *)(&INODE_PKEY(hint->inode)->k_objectid);
- }
-
- hint->search_start =
- hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
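
The beg + hash % (end - beg) pattern above maps a 32-bit key field deterministically into the usable block range, so allocations keyed by the same directory land near each other. keyed_hash() is reiserfs-internal; the sketch below substitutes a toy multiplicative hash purely to illustrate the mapping:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for keyed_hash(); not the reiserfs hash. */
static uint32_t toy_hash(uint32_t v)
{
	return v * 2654435761u;		/* Knuth multiplicative hash */
}

int main(void)
{
	uint32_t dirid = 42;
	unsigned long beg = 0, end = 1UL << 20;	/* assumed 2^20-block fs */
	unsigned long search_start = beg + toy_hash(dirid) % (end - beg);

	printf("dirid %u -> search_start %lu\n", (unsigned)dirid,
	       search_start);
	return 0;
}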
-
-/*
- * Relocation based on dirid: files are hashed into a given bitmap
- * block group. Formatted nodes are unaffected; a separate policy
- * covers them.
- */
-static void dirid_groups(reiserfs_blocknr_hint_t * hint)
-{
- unsigned long hash;
- __u32 dirid = 0;
- int bm = 0;
- struct super_block *sb = hint->th->t_super;
-
- if (hint->inode)
- dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
- else if (hint->formatted_node)
- dirid = hint->key.k_dir_id;
-
- if (dirid) {
- bm = bmap_hash_id(sb, dirid);
- hash = bm * (sb->s_blocksize << 3);
- /* give a portion of the block group to metadata */
- if (hint->inode)
- hash += sb->s_blocksize / 2;
- hint->search_start = hash;
- }
-}
-
-/*
- * Relocation based on oid: files are hashed into a given bitmap
- * block group. Formatted nodes are unaffected; a separate policy
- * covers them.
- */
-static void oid_groups(reiserfs_blocknr_hint_t * hint)
-{
- if (hint->inode) {
- unsigned long hash;
- __u32 oid;
- __u32 dirid;
- int bm;
-
- dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-
- /*
-		 * keep the root dir and its first set of subdirs close to
- * the start of the disk
- */
- if (dirid <= 2)
- hash = (hint->inode->i_sb->s_blocksize << 3);
- else {
- oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
- bm = bmap_hash_id(hint->inode->i_sb, oid);
- hash = bm * (hint->inode->i_sb->s_blocksize << 3);
- }
- hint->search_start = hash;
- }
-}
-
-/*
- * returns 1 if it finds an indirect item and gets valid hint info
- * from it, otherwise 0
- */
-static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
-{
- struct treepath *path;
- struct buffer_head *bh;
- struct item_head *ih;
- int pos_in_item;
- __le32 *item;
- int ret = 0;
-
- /*
- * reiserfs code can call this function w/o pointer to path
- * structure supplied; then we rely on supplied search_start
- */
- if (!hint->path)
- return 0;
-
- path = hint->path;
- bh = get_last_bh(path);
- RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
- ih = tp_item_head(path);
- pos_in_item = path->pos_in_item;
- item = tp_item_body(path);
-
- hint->search_start = bh->b_blocknr;
-
- /*
- * for indirect item: go to left and look for the first non-hole entry
- * in the indirect item
- */
- if (!hint->formatted_node && is_indirect_le_ih(ih)) {
- if (pos_in_item == I_UNFM_NUM(ih))
- pos_in_item--;
- while (pos_in_item >= 0) {
- int t = get_block_num(item, pos_in_item);
- if (t) {
- hint->search_start = t;
- ret = 1;
- break;
- }
- pos_in_item--;
- }
- }
-
- /* does result value fit into specified region? */
- return ret;
-}
-
-/*
- * If this is a formatted node, try to put it in the first part of the
- * device, whose size is specified as a percentage via a mount option;
- * otherwise try to put it in the rest of the device. This is not to say
- * it is good code to do so, but the effect should be measured.
- */
-static inline void set_border_in_hint(struct super_block *s,
- reiserfs_blocknr_hint_t * hint)
-{
- b_blocknr_t border =
- SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
-
- if (hint->formatted_node)
- hint->end = border - 1;
- else
- hint->beg = border;
-}
-
-static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
-{
- if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
- hint->search_start =
- hint->beg +
- keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
- 4) % (hint->end - hint->beg);
- else
- hint->search_start =
- hint->beg +
- keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
- 4) % (hint->end - hint->beg);
-}
-
-static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
-{
- char *hash_in;
-
- if (!hint->inode)
- hash_in = (char *)&hint->key.k_dir_id;
- else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
- hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
- else
- hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-
- hint->search_start =
- hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
-}
-
-static inline int
-this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
- hint)
-{
- return hint->block ==
- REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
-}
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
-static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
-{
- struct in_core_key *key = &hint->key;
-
- hint->th->displace_new_blocks = 0;
- hint->search_start =
- hint->beg + keyed_hash((char *)(&key->k_objectid),
- 4) % (hint->end - hint->beg);
-}
-#endif
-
-static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
-{
- b_blocknr_t border;
- u32 hash_in;
-
- if (hint->formatted_node || hint->inode == NULL) {
- return 0;
- }
-
- hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
- border =
- hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
- 4) % (hint->end - hint->beg - 1);
- if (border > hint->search_start)
- hint->search_start = border;
-
- return 1;
-}
-
-static inline int old_way(reiserfs_blocknr_hint_t * hint)
-{
- b_blocknr_t border;
-
- if (hint->formatted_node || hint->inode == NULL) {
- return 0;
- }
-
- border =
- hint->beg +
- le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
- hint->beg);
- if (border > hint->search_start)
- hint->search_start = border;
-
- return 1;
-}
-
-static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
-{
- struct in_core_key *key = &hint->key;
- b_blocknr_t slice_start;
-
- slice_start =
- (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
- if (slice_start > hint->search_start
- || slice_start + (hint->end / 100) <= hint->search_start) {
- hint->search_start = slice_start;
- }
-}
-
-static void determine_search_start(reiserfs_blocknr_hint_t * hint,
- int amount_needed)
-{
- struct super_block *s = hint->th->t_super;
- int unfm_hint;
-
- hint->beg = 0;
- hint->end = SB_BLOCK_COUNT(s) - 1;
-
- /* This is former border algorithm. Now with tunable border offset */
- if (concentrating_formatted_nodes(s))
- set_border_in_hint(s, hint);
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /*
- * whenever we create a new directory, we displace it. At first
- * we will hash for location, later we might look for a moderately
- * empty place for it
- */
- if (displacing_new_packing_localities(s)
- && hint->th->displace_new_blocks) {
- displace_new_packing_locality(hint);
-
- /*
- * we do not continue determine_search_start,
- * if new packing locality is being displaced
- */
- return;
- }
-#endif
-
- /*
- * all persons should feel encouraged to add more special cases
- * here and test them
- */
-
- if (displacing_large_files(s) && !hint->formatted_node
- && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
- displace_large_file(hint);
- return;
- }
-
- /*
- * if none of our special cases is relevant, use the left
- * neighbor in the tree order of the new node we are allocating for
- */
- if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
- hash_formatted_node(hint);
- return;
- }
-
- unfm_hint = get_left_neighbor(hint);
-
- /*
- * Mimic old block allocator behaviour, that is if VFS allowed for
- * preallocation, new blocks are displaced based on directory ID.
- * Also, if suggested search_start is less than last preallocated
- * block, we start searching from it, assuming that HDD dataflow
- * is faster in forward direction
- */
- if (TEST_OPTION(old_way, s)) {
- if (!hint->formatted_node) {
- if (!reiserfs_hashed_relocation(s))
- old_way(hint);
- else if (!reiserfs_no_unhashed_relocation(s))
- old_hashed_relocation(hint);
-
- if (hint->inode
- && hint->search_start <
- REISERFS_I(hint->inode)->i_prealloc_block)
- hint->search_start =
- REISERFS_I(hint->inode)->i_prealloc_block;
- }
- return;
- }
-
- /* This is an approach proposed by Hans */
- if (TEST_OPTION(hundredth_slices, s)
- && !(displacing_large_files(s) && !hint->formatted_node)) {
- hundredth_slices(hint);
- return;
- }
-
- /* old_hashed_relocation only works on unformatted */
- if (!unfm_hint && !hint->formatted_node &&
- TEST_OPTION(old_hashed_relocation, s)) {
- old_hashed_relocation(hint);
- }
-
- /* new_hashed_relocation works with both formatted/unformatted nodes */
- if ((!unfm_hint || hint->formatted_node) &&
- TEST_OPTION(new_hashed_relocation, s)) {
- new_hashed_relocation(hint);
- }
-
- /* dirid grouping works only on unformatted nodes */
- if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
- dirid_groups(hint);
- }
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
- dirid_groups(hint);
- }
-#endif
-
- /* oid grouping works only on unformatted nodes */
- if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
- oid_groups(hint);
- }
- return;
-}
-
-static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
-{
- /* make minimum size a mount option and benchmark both ways */
- /* we preallocate blocks only for regular files, specific size */
- /* benchmark preallocating always and see what happens */
-
- hint->prealloc_size = 0;
-
- if (!hint->formatted_node && hint->preallocate) {
- if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode)
- && hint->inode->i_size >=
- REISERFS_SB(hint->th->t_super)->s_alloc_options.
- preallocmin * hint->inode->i_sb->s_blocksize)
- hint->prealloc_size =
- REISERFS_SB(hint->th->t_super)->s_alloc_options.
- preallocsize - 1;
- }
- return CARRY_ON;
-}
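
In other words, a regular non-private file earns preallocsize - 1 extra blocks once it has grown to at least preallocmin blocks. A sketch of the rule using the fallback values from reiserfs_parse_alloc_options() above (preallocmin = 4, preallocsize = PREALLOCATION_SIZE = 9) and an assumed 4 KiB block size:

#include <stdio.h>

int main(void)
{
	unsigned long blocksize = 4096;		/* assumed */
	unsigned long preallocmin = 4, preallocsize = 9;
	unsigned long sizes[] = { 4096, 16384, 1048576 };

	for (int i = 0; i < 3; i++) {
		int prealloc = sizes[i] >= preallocmin * blocksize
				? (int)preallocsize - 1 : 0;

		printf("file of %lu bytes -> %d preallocated blocks\n",
		       sizes[i], prealloc);
	}
	return 0;
}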
-
-static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
- b_blocknr_t * new_blocknrs,
- b_blocknr_t start,
- b_blocknr_t finish, int min,
- int amount_needed,
- int prealloc_size)
-{
- int rest = amount_needed;
- int nr_allocated;
-
- while (rest > 0 && start <= finish) {
- nr_allocated = scan_bitmap(hint->th, &start, finish, min,
- rest + prealloc_size,
- !hint->formatted_node, hint->block);
-
- if (nr_allocated == 0) /* no new blocks allocated, return */
- break;
-
- /* fill free_blocknrs array first */
- while (rest > 0 && nr_allocated > 0) {
- *new_blocknrs++ = start++;
- rest--;
- nr_allocated--;
- }
-
-		/* do we also have something to fill the prealloc array? */
- if (nr_allocated > 0) {
- /*
-			 * it means prealloc_size was greater than 0 and
- * we do preallocation
- */
- list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
- &SB_JOURNAL(hint->th->t_super)->
- j_prealloc_list);
- REISERFS_I(hint->inode)->i_prealloc_block = start;
- REISERFS_I(hint->inode)->i_prealloc_count =
- nr_allocated;
- break;
- }
- }
-
- return (amount_needed - rest);
-}
-
-static inline int blocknrs_and_prealloc_arrays_from_search_start
- (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
- int amount_needed) {
- struct super_block *s = hint->th->t_super;
- b_blocknr_t start = hint->search_start;
- b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
- int passno = 0;
- int nr_allocated = 0;
- int depth;
-
- determine_prealloc_size(hint);
- if (!hint->formatted_node) {
- int quota_ret;
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(s, REISERFS_DEBUG_CODE,
- "reiserquota: allocating %d blocks id=%u",
- amount_needed, hint->inode->i_uid);
-#endif
- depth = reiserfs_write_unlock_nested(s);
- quota_ret =
- dquot_alloc_block_nodirty(hint->inode, amount_needed);
- if (quota_ret) { /* Quota exceeded? */
- reiserfs_write_lock_nested(s, depth);
- return QUOTA_EXCEEDED;
- }
- if (hint->preallocate && hint->prealloc_size) {
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(s, REISERFS_DEBUG_CODE,
- "reiserquota: allocating (prealloc) %d blocks id=%u",
- hint->prealloc_size, hint->inode->i_uid);
-#endif
- quota_ret = dquot_prealloc_block_nodirty(hint->inode,
- hint->prealloc_size);
- if (quota_ret)
- hint->preallocate = hint->prealloc_size = 0;
- }
- /* for unformatted nodes, force large allocations */
- reiserfs_write_lock_nested(s, depth);
- }
-
- do {
- switch (passno++) {
- case 0: /* Search from hint->search_start to end of disk */
- start = hint->search_start;
- finish = SB_BLOCK_COUNT(s) - 1;
- break;
- case 1: /* Search from hint->beg to hint->search_start */
- start = hint->beg;
- finish = hint->search_start;
- break;
- case 2: /* Last chance: Search from 0 to hint->beg */
- start = 0;
- finish = hint->beg;
- break;
- default:
- /* We've tried searching everywhere, not enough space */
- /* Free the blocks */
- if (!hint->formatted_node) {
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(s, REISERFS_DEBUG_CODE,
- "reiserquota: freeing (nospace) %d blocks id=%u",
- amount_needed +
- hint->prealloc_size -
- nr_allocated,
- hint->inode->i_uid);
-#endif
- /* Free not allocated blocks */
- depth = reiserfs_write_unlock_nested(s);
- dquot_free_block_nodirty(hint->inode,
- amount_needed + hint->prealloc_size -
- nr_allocated);
- reiserfs_write_lock_nested(s, depth);
- }
- while (nr_allocated--)
- reiserfs_free_block(hint->th, hint->inode,
- new_blocknrs[nr_allocated],
- !hint->formatted_node);
-
- return NO_DISK_SPACE;
- }
- } while ((nr_allocated += allocate_without_wrapping_disk(hint,
- new_blocknrs +
- nr_allocated,
- start, finish,
- 1,
- amount_needed -
- nr_allocated,
- hint->
- prealloc_size))
- < amount_needed);
- if (!hint->formatted_node &&
- amount_needed + hint->prealloc_size >
- nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
- /* Some of preallocation blocks were not allocated */
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(s, REISERFS_DEBUG_CODE,
- "reiserquota: freeing (failed prealloc) %d blocks id=%u",
- amount_needed + hint->prealloc_size -
- nr_allocated -
- REISERFS_I(hint->inode)->i_prealloc_count,
- hint->inode->i_uid);
-#endif
-
- depth = reiserfs_write_unlock_nested(s);
- dquot_free_block_nodirty(hint->inode, amount_needed +
- hint->prealloc_size - nr_allocated -
- REISERFS_I(hint->inode)->
- i_prealloc_count);
- reiserfs_write_lock_nested(s, depth);
- }
-
- return CARRY_ON;
-}
-
-/* grab new blocknrs from preallocated list */
-/* return amount still needed after using them */
-static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
- b_blocknr_t * new_blocknrs,
- int amount_needed)
-{
- struct inode *inode = hint->inode;
-
- if (REISERFS_I(inode)->i_prealloc_count > 0) {
- while (amount_needed) {
-
- *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
- REISERFS_I(inode)->i_prealloc_count--;
-
- amount_needed--;
-
- if (REISERFS_I(inode)->i_prealloc_count <= 0) {
- list_del(&REISERFS_I(inode)->i_prealloc_list);
- break;
- }
- }
- }
- /* return amount still needed after using preallocated blocks */
- return amount_needed;
-}
-
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
- b_blocknr_t *new_blocknrs,
- int amount_needed,
- /* Amount of blocks we have already reserved */
- int reserved_by_us)
-{
- int initial_amount_needed = amount_needed;
- int ret;
- struct super_block *s = hint->th->t_super;
-
- /* Check if there is enough space, taking into account reserved space */
- if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
- amount_needed - reserved_by_us)
- return NO_DISK_SPACE;
- /* should this be if !hint->inode && hint->preallocate? */
- /* do you mean hint->formatted_node can be removed ? - Zam */
- /*
- * hint->formatted_node cannot be removed because we try to access
- * inode information here, and there is often no inode associated with
- * metadata allocations - green
- */
-
- if (!hint->formatted_node && hint->preallocate) {
- amount_needed = use_preallocated_list_if_available
- (hint, new_blocknrs, amount_needed);
-
- /*
- * We have all the block numbers we need from the
- * prealloc list
- */
- if (amount_needed == 0)
- return CARRY_ON;
- new_blocknrs += (initial_amount_needed - amount_needed);
- }
-
- /* find search start and save it in hint structure */
- determine_search_start(hint, amount_needed);
- if (hint->search_start >= SB_BLOCK_COUNT(s))
- hint->search_start = SB_BLOCK_COUNT(s) - 1;
-
- /* allocation itself; fill new_blocknrs and preallocation arrays */
- ret = blocknrs_and_prealloc_arrays_from_search_start
- (hint, new_blocknrs, amount_needed);
-
- /*
- * We used prealloc. list to fill (partially) new_blocknrs array.
- * If final allocation fails we need to return blocks back to
- * prealloc. list or just free them. -- Zam (I chose second
- * variant)
- */
- if (ret != CARRY_ON) {
- while (amount_needed++ < initial_amount_needed) {
- reiserfs_free_block(hint->th, hint->inode,
- *(--new_blocknrs), 1);
- }
- }
- return ret;
-}
-
-void reiserfs_cache_bitmap_metadata(struct super_block *sb,
- struct buffer_head *bh,
- struct reiserfs_bitmap_info *info)
-{
- unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
-
- /* The first bit must ALWAYS be 1 */
- if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data))
- reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is "
- "corrupted: first bit must be 1", bh->b_blocknr);
-
- info->free_count = 0;
-
- while (--cur >= (unsigned long *)bh->b_data) {
- /* 0 and ~0 are special, we can optimize for them */
- if (*cur == 0)
- info->free_count += BITS_PER_LONG;
- else if (*cur != ~0L) /* A mix, investigate */
- info->free_count += BITS_PER_LONG - hweight_long(*cur);
- }
-}
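
hweight_long() is the kernel's per-word population count, so the loop above tallies free (zero) bits one word at a time, short-circuiting the all-used and all-free words. An equivalent user-space sketch, with the GCC/Clang builtin standing in for hweight_long():

#include <stdio.h>
#include <limits.h>

static unsigned int count_free_bits(const unsigned long *map, int nwords)
{
	unsigned int bits_per_word = sizeof(long) * CHAR_BIT;
	unsigned int free_count = 0;

	for (int i = 0; i < nwords; i++) {
		if (map[i] == 0)		/* wholly free */
			free_count += bits_per_word;
		else if (map[i] != ~0UL)	/* a mix, count it */
			free_count += bits_per_word -
				      __builtin_popcountl(map[i]);
	}
	return free_count;
}

int main(void)
{
	unsigned long map[3] = { ~0UL, 0UL, 0xffUL };

	/* On a 64-bit host: 0 + 64 + 56 = 120 free bits. */
	printf("%u free bits\n", count_free_bits(map, 3));
	return 0;
}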
-
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
- unsigned int bitmap)
-{
- b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
- struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
- struct buffer_head *bh;
-
- /*
- * Way old format filesystems had the bitmaps packed up front.
- * I doubt there are any of these left, but just in case...
- */
- if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &REISERFS_SB(sb)->s_properties)))
- block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
- else if (bitmap == 0)
- block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
-
- bh = sb_bread(sb, block);
- if (bh == NULL)
- reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
- "reading failed", __func__, block);
- else {
- if (buffer_locked(bh)) {
- int depth;
- PROC_INFO_INC(sb, scan_bitmap.wait);
- depth = reiserfs_write_unlock_nested(sb);
- __wait_on_buffer(bh);
- reiserfs_write_lock_nested(sb, depth);
- }
- BUG_ON(!buffer_uptodate(bh));
- BUG_ON(atomic_read(&bh->b_count) == 0);
-
- if (info->free_count == UINT_MAX)
- reiserfs_cache_bitmap_metadata(sb, bh, info);
- }
-
- return bh;
-}
-
-int reiserfs_init_bitmap_cache(struct super_block *sb)
-{
- struct reiserfs_bitmap_info *bitmap;
- unsigned int bmap_nr = reiserfs_bmap_count(sb);
-
- bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap)));
- if (bitmap == NULL)
- return -ENOMEM;
-
- memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr);
-
- SB_AP_BITMAP(sb) = bitmap;
-
- return 0;
-}
-
-void reiserfs_free_bitmap_cache(struct super_block *sb)
-{
- if (SB_AP_BITMAP(sb)) {
- vfree(SB_AP_BITMAP(sb));
- SB_AP_BITMAP(sb) = NULL;
- }
-}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
deleted file mode 100644
index 79ee2b436685..000000000000
--- a/fs/reiserfs/dir.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/stat.h>
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-extern const struct reiserfs_key MIN_KEY;
-
-static int reiserfs_readdir(struct file *, struct dir_context *);
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync);
-
-const struct file_operations reiserfs_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = reiserfs_readdir,
- .fsync = reiserfs_dir_fsync,
- .unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = reiserfs_compat_ioctl,
-#endif
-};
-
-static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- int err;
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
-
- inode_lock(inode);
- reiserfs_write_lock(inode->i_sb);
- err = reiserfs_commit_for_inode(inode);
- reiserfs_write_unlock(inode->i_sb);
- inode_unlock(inode);
- if (err < 0)
- return err;
- return 0;
-}
-
-#define store_ih(where,what) copy_item_head (where, what)
-
-static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
-{
- struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
- return (d_really_is_positive(privroot) &&
- deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
-}
-
-int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
-{
-
- /* key of current position in the directory (key of directory entry) */
- struct cpu_key pos_key;
-
- INITIALIZE_PATH(path_to_entry);
- struct buffer_head *bh;
- int item_num, entry_num;
- const struct reiserfs_key *rkey;
- struct item_head *ih, tmp_ih;
- int search_res;
- char *local_buf;
- loff_t next_pos;
- char small_buf[32]; /* avoid kmalloc if we can */
- struct reiserfs_dir_entry de;
- int ret = 0;
- int depth;
-
- reiserfs_write_lock(inode->i_sb);
-
- reiserfs_check_lock_depth(inode->i_sb, "readdir");
-
- /*
-	 * form the key to search for the next directory entry,
-	 * using the f_pos field of the file structure
- */
- make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
- next_pos = cpu_key_k_offset(&pos_key);
-
- path_to_entry.reada = PATH_READA;
- while (1) {
-research:
- /*
-		 * search for the directory item containing the entry
-		 * with the specified key
- */
- search_res =
- search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
- &de);
- if (search_res == IO_ERROR) {
- /*
- * FIXME: we could just skip part of directory
- * which could not be read
- */
- ret = -EIO;
- goto out;
- }
- entry_num = de.de_entry_num;
- bh = de.de_bh;
- item_num = de.de_item_num;
- ih = de.de_ih;
- store_ih(&tmp_ih, ih);
-
-		/* we must have found an item, i.e. an item of this directory */
- RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
- "vs-9000: found item %h does not match to dir we readdir %K",
- ih, &pos_key);
- RFALSE(item_num > B_NR_ITEMS(bh) - 1,
- "vs-9005 item_num == %d, item amount == %d",
- item_num, B_NR_ITEMS(bh));
-
- /*
-		 * and the entry number must not exceed the number of
-		 * entries in the item
- */
- RFALSE(ih_entry_count(ih) < entry_num,
- "vs-9010: entry number is too big %d (%d)",
- entry_num, ih_entry_count(ih));
-
- /*
- * go through all entries in the directory item beginning
-		 * from the entry that has been found
- */
- if (search_res == POSITION_FOUND
- || entry_num < ih_entry_count(ih)) {
- struct reiserfs_de_head *deh =
- B_I_DEH(bh, ih) + entry_num;
-
- for (; entry_num < ih_entry_count(ih);
- entry_num++, deh++) {
- int d_reclen;
- char *d_name;
- ino_t d_ino;
- loff_t cur_pos = deh_offset(deh);
-
-				/* it is a hidden entry */
- if (!de_visible(deh))
- continue;
- d_reclen = entry_length(bh, ih, entry_num);
- d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
-
- if (d_reclen <= 0 ||
- d_name + d_reclen > bh->b_data + bh->b_size) {
- /*
- * There is corrupted data in entry,
- * We'd better stop here
- */
- pathrelse(&path_to_entry);
- ret = -EIO;
- goto out;
- }
-
- if (!d_name[d_reclen - 1])
- d_reclen = strlen(d_name);
-
- /* too big to send back to VFS */
- if (d_reclen >
- REISERFS_MAX_NAME(inode->i_sb->
- s_blocksize)) {
- continue;
- }
-
- /* Ignore the .reiserfs_priv entry */
- if (is_privroot_deh(inode, deh))
- continue;
-
- ctx->pos = deh_offset(deh);
- d_ino = deh_objectid(deh);
- if (d_reclen <= 32) {
- local_buf = small_buf;
- } else {
- local_buf = kmalloc(d_reclen,
- GFP_NOFS);
- if (!local_buf) {
- pathrelse(&path_to_entry);
- ret = -ENOMEM;
- goto out;
- }
- if (item_moved(&tmp_ih, &path_to_entry)) {
- kfree(local_buf);
- goto research;
- }
- }
-
- /*
-				 * Note that we copy the name to user space
-				 * via a temporary buffer (local_buf) because
-				 * filldir will block if the user space buffer
-				 * is swapped out; in the meantime the entry
-				 * can move somewhere else
- */
- memcpy(local_buf, d_name, d_reclen);
-
- /*
- * Since filldir might sleep, we can release
- * the write lock here for other waiters
- */
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- if (!dir_emit
- (ctx, local_buf, d_reclen, d_ino,
- DT_UNKNOWN)) {
- reiserfs_write_lock_nested(inode->i_sb, depth);
- if (local_buf != small_buf) {
- kfree(local_buf);
- }
- goto end;
- }
- reiserfs_write_lock_nested(inode->i_sb, depth);
- if (local_buf != small_buf) {
- kfree(local_buf);
- }
-
- /* deh_offset(deh) may be invalid now. */
- next_pos = cur_pos + 1;
-
- if (item_moved(&tmp_ih, &path_to_entry)) {
- set_cpu_key_k_offset(&pos_key,
- next_pos);
- goto research;
- }
- } /* for */
- }
-
- /* end of directory has been reached */
- if (item_num != B_NR_ITEMS(bh) - 1)
- goto end;
-
- /*
-		 * The item we went through is the last item of the node.
-		 * Using the right delimiting key, check whether this is
-		 * the end of the directory.
- */
- rkey = get_rkey(&path_to_entry, inode->i_sb);
- if (!comp_le_keys(rkey, &MIN_KEY)) {
- /*
-			 * set pos_key to the smallest key greater than
-			 * the key of the last entry in the item
- */
- set_cpu_key_k_offset(&pos_key, next_pos);
- continue;
- }
-
- /* end of directory has been reached */
- if (COMP_SHORT_KEYS(rkey, &pos_key)) {
- goto end;
- }
-
- /* directory continues in the right neighboring block */
- set_cpu_key_k_offset(&pos_key,
- le_key_k_offset(KEY_FORMAT_3_5, rkey));
-
- } /* while */
-
-end:
- ctx->pos = next_pos;
- pathrelse(&path_to_entry);
- reiserfs_check_path(&path_to_entry);
-out:
- reiserfs_write_unlock(inode->i_sb);
- return ret;
-}
-
-static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
-{
- return reiserfs_readdir_inode(file_inode(file), ctx);
-}
-
-/*
- * compose directory item containing "." and ".." entries (entries are
- * not aligned to 4 byte boundary)
- */
-void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
- __le32 par_dirid, __le32 par_objid)
-{
- struct reiserfs_de_head *dot, *dotdot;
-
- memset(body, 0, EMPTY_DIR_SIZE_V1);
- dot = (struct reiserfs_de_head *)body;
- dotdot = dot + 1;
-
- /* direntry header of "." */
- put_deh_offset(dot, DOT_OFFSET);
- /* these two are from make_le_item_head, and are LE */
- dot->deh_dir_id = dirid;
- dot->deh_objectid = objid;
- dot->deh_state = 0; /* Endian safe if 0 */
- put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
- mark_de_visible(dot);
-
- /* direntry header of ".." */
- put_deh_offset(dotdot, DOT_DOT_OFFSET);
- /* key of ".." for the root directory */
- /* these two are from the inode, and are LE */
- dotdot->deh_dir_id = par_dirid;
- dotdot->deh_objectid = par_objid;
- dotdot->deh_state = 0; /* Endian safe if 0 */
- put_deh_location(dotdot, deh_location(dot) - strlen(".."));
- mark_de_visible(dotdot);
-
- /* copy ".." and "." */
- memcpy(body + deh_location(dot), ".", 1);
- memcpy(body + deh_location(dotdot), "..", 2);
-}
-
-/* compose directory item containing "." and ".." entries */
-void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
- __le32 par_dirid, __le32 par_objid)
-{
- struct reiserfs_de_head *dot, *dotdot;
-
- memset(body, 0, EMPTY_DIR_SIZE);
- dot = (struct reiserfs_de_head *)body;
- dotdot = dot + 1;
-
- /* direntry header of "." */
- put_deh_offset(dot, DOT_OFFSET);
- /* these two are from make_le_item_head, and are LE */
- dot->deh_dir_id = dirid;
- dot->deh_objectid = objid;
- dot->deh_state = 0; /* Endian safe if 0 */
- put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
- mark_de_visible(dot);
-
- /* direntry header of ".." */
- put_deh_offset(dotdot, DOT_DOT_OFFSET);
- /* key of ".." for the root directory */
- /* these two are from the inode, and are LE */
- dotdot->deh_dir_id = par_dirid;
- dotdot->deh_objectid = par_objid;
- dotdot->deh_state = 0; /* Endian safe if 0 */
- put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
- mark_de_visible(dotdot);
-
- /* copy ".." and "." */
- memcpy(body + deh_location(dot), ".", 1);
- memcpy(body + deh_location(dotdot), "..", 2);
-}
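
Both builders lay the item out the same way: the two entry heads sit at the front of the body, the names are packed at the tail, and each deh_location points backwards from the end of the item. A layout sketch for the v1 variant, assuming the 16-byte on-disk entry head (DEH_SIZE = 16), which makes EMPTY_DIR_SIZE_V1 = 2 * 16 + 1 + 2 = 35:

#include <stdio.h>
#include <string.h>

#define DEH_SIZE		16	/* assumed sizeof(reiserfs_de_head) */
#define EMPTY_DIR_SIZE_V1	(DEH_SIZE * 2 + 1 + 2)	/* heads + "." + ".." */

int main(void)
{
	char body[EMPTY_DIR_SIZE_V1];
	int dot_loc = EMPTY_DIR_SIZE_V1 - (int)strlen(".");
	int dotdot_loc = dot_loc - (int)strlen("..");

	memset(body, 0, sizeof(body));
	memcpy(body + dot_loc, ".", 1);
	memcpy(body + dotdot_loc, "..", 2);

	/* "." occupies the last byte, ".." the two bytes before it. */
	printf("item size %d: \".\" at %d, \"..\" at %d\n",
	       EMPTY_DIR_SIZE_V1, dot_loc, dotdot_loc);
	return 0;
}

The v2 variant differs only in rounding each name length up (ROUND_UP above) before packing, so the heads-in-front, names-at-tail shape is unchanged.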
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
deleted file mode 100644
index 5129efc6f2e6..000000000000
--- a/fs/reiserfs/do_balan.c
+++ /dev/null
@@ -1,1900 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Now we have all the buffers that must be used in balancing the
- * tree. Further calculations cannot cause schedule(), so the buffer
- * tree will remain stable until the balancing is finished. Balance
- * the tree according to the analysis made before, using the buffers
- * obtained above.
- */
-
-#include <linux/uaccess.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/kernel.h>
-
-static inline void buffer_info_init_left(struct tree_balance *tb,
- struct buffer_info *bi)
-{
- bi->tb = tb;
- bi->bi_bh = tb->L[0];
- bi->bi_parent = tb->FL[0];
- bi->bi_position = get_left_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_right(struct tree_balance *tb,
- struct buffer_info *bi)
-{
- bi->tb = tb;
- bi->bi_bh = tb->R[0];
- bi->bi_parent = tb->FR[0];
- bi->bi_position = get_right_neighbor_position(tb, 0);
-}
-
-static inline void buffer_info_init_tbS0(struct tree_balance *tb,
- struct buffer_info *bi)
-{
- bi->tb = tb;
- bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
- bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
- bi->bi_position = PATH_H_POSITION(tb->tb_path, 1);
-}
-
-static inline void buffer_info_init_bh(struct tree_balance *tb,
- struct buffer_info *bi,
- struct buffer_head *bh)
-{
- bi->tb = tb;
- bi->bi_bh = bh;
- bi->bi_parent = NULL;
- bi->bi_position = 0;
-}
-
-inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
- struct buffer_head *bh, int flag)
-{
- journal_mark_dirty(tb->transaction_handle, bh);
-}
-
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-/*
- * summary:
- * if deleting something ( tb->insert_size[0] < 0 )
- * return(balance_leaf_when_delete()); (flag d handled here)
- * else
- * if lnum is larger than 0 we put items into the left node
- * if rnum is larger than 0 we put items into the right node
- * if snum1 is larger than 0 we put items into the new node s1
- * if snum2 is larger than 0 we put items into the new node s2
- * Note that all *num* count new items being created.
- */
-
-static void balance_leaf_when_delete_del(struct tree_balance *tb)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path);
- struct buffer_info bi;
-#ifdef CONFIG_REISERFS_CHECK
- struct item_head *ih = item_head(tbS0, item_pos);
-#endif
-
- RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
- "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
- -tb->insert_size[0], ih);
-
- buffer_info_init_tbS0(tb, &bi);
- leaf_delete_items(&bi, 0, item_pos, 1, -1);
-
- if (!item_pos && tb->CFL[0]) {
- if (B_NR_ITEMS(tbS0)) {
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
- } else {
- if (!PATH_H_POSITION(tb->tb_path, 1))
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- PATH_H_PPARENT(tb->tb_path, 0), 0);
- }
- }
-
- RFALSE(!item_pos && !tb->CFL[0],
- "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
- tb->L[0]);
-}
-
-/* cut item in S[0] */
-static void balance_leaf_when_delete_cut(struct tree_balance *tb)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path);
- struct item_head *ih = item_head(tbS0, item_pos);
- int pos_in_item = tb->tb_path->pos_in_item;
- struct buffer_info bi;
- buffer_info_init_tbS0(tb, &bi);
-
- if (is_direntry_le_ih(ih)) {
- /*
- * UFS unlink semantics are such that you can only
- * delete one directory entry at a time.
- *
- * when we cut a directory tb->insert_size[0] means
- * number of entries to be cut (always 1)
- */
- tb->insert_size[0] = -1;
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
-
- RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
- "PAP-12030: can not change delimiting key. CFL[0]=%p",
- tb->CFL[0]);
-
- if (!item_pos && !pos_in_item && tb->CFL[0])
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
- } else {
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
-
- RFALSE(!ih_item_len(ih),
- "PAP-12035: cut must leave non-zero dynamic "
- "length of item");
- }
-}
-
-static int balance_leaf_when_delete_left(struct tree_balance *tb)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
-
- /* L[0] must be joined with S[0] */
- if (tb->lnum[0] == -1) {
- /* R[0] must be also joined with S[0] */
- if (tb->rnum[0] == -1) {
- if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
- /*
- * all contents of all the
- * 3 buffers will be in L[0]
- */
- if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
- 1 < B_NR_ITEMS(tb->FR[0]))
- replace_key(tb, tb->CFL[0],
- tb->lkey[0], tb->FR[0], 1);
-
- leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
- NULL);
- leaf_move_items(LEAF_FROM_R_TO_L, tb,
- B_NR_ITEMS(tb->R[0]), -1,
- NULL);
-
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb, tb->R[0]);
-
- return 0;
- }
-
- /* all contents of all the 3 buffers will be in R[0] */
- leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
- leaf_move_items(LEAF_FROM_L_TO_R, tb,
- B_NR_ITEMS(tb->L[0]), -1, NULL);
-
- /* right_delimiting_key is correct in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb, tb->L[0]);
-
- return -1;
- }
-
- RFALSE(tb->rnum[0] != 0,
- "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
- /* all contents of L[0] and S[0] will be in L[0] */
- leaf_shift_left(tb, n, -1);
-
- reiserfs_invalidate_buffer(tb, tbS0);
-
- return 0;
- }
-
- /*
- * part of the contents of S[0] will be in L[0] and
- * the rest will be in R[0]
- */
-
- RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
- (tb->lnum[0] + tb->rnum[0] > n + 1),
- "PAP-12050: rnum(%d) and lnum(%d) and item "
- "number(%d) in S[0] are not consistent",
- tb->rnum[0], tb->lnum[0], n);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
- (tb->lbytes != -1 || tb->rbytes != -1),
- "PAP-12055: bad rbytes (%d)/lbytes (%d) "
- "parameters when items are not split",
- tb->rbytes, tb->lbytes);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
- (tb->lbytes < 1 || tb->rbytes != -1),
- "PAP-12060: bad rbytes (%d)/lbytes (%d) "
- "parameters when items are split",
- tb->rbytes, tb->lbytes);
-
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
- reiserfs_invalidate_buffer(tb, tbS0);
-
- return 0;
-}
-
-/*
- * Balance leaf node in case of delete or cut: insert_size[0] < 0
- *
- * lnum, rnum can have values >= -1
- * -1 means that the neighbor must be joined with S
- * 0 means that nothing should be done with the neighbor
- * >0 means shift the specified number of items to the neighbor,
- *    entirely or in part
- */
-static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct buffer_info bi;
- int n;
-
- RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
- "vs- 12000: level: wrong FR %z", tb->FR[0]);
- RFALSE(tb->blknum[0] > 1,
- "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
- RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
- "PAP-12010: tree can not be empty");
-
- buffer_info_init_tbS0(tb, &bi);
-
- /* Delete or truncate the item */
-
- BUG_ON(flag != M_DELETE && flag != M_CUT);
- if (flag == M_DELETE)
- balance_leaf_when_delete_del(tb);
- else /* M_CUT */
- balance_leaf_when_delete_cut(tb);
-
- /*
- * the rule is that no shifting occurs unless by shifting
- * a node can be freed
- */
- n = B_NR_ITEMS(tbS0);
-
- /* L[0] takes part in balancing */
- if (tb->lnum[0])
- return balance_leaf_when_delete_left(tb);
-
- if (tb->rnum[0] == -1) {
- /* all contents of R[0] and S[0] will be in R[0] */
- leaf_shift_right(tb, n, -1);
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
- }
-
- RFALSE(tb->rnum[0],
- "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
- return 0;
-}
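To make the lnum/rnum convention above concrete, here is a tiny free-standing restatement (a hypothetical helper, not part of reiserfs) of how the values produced by the fix_nodes analysis map to actions on delete:

    /* Illustrative only: the lnum[0]/rnum[0] convention on delete. */
    const char *delete_plan(int lnum, int rnum)
    {
            if (lnum == -1 && rnum == -1)
                    return "join L[0], S[0] and R[0] into one node";
            if (lnum == -1)
                    return "join S[0] into L[0]";
            if (rnum == -1)
                    return "join S[0] into R[0]";
            if (lnum > 0 || rnum > 0)
                    return "shift items to the neighbors to free S[0]";
            return "leave the neighbors untouched";
    }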
-
-static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
- struct item_head *const ih,
- const char * const body)
-{
- int ret;
- struct buffer_info bi;
- int n = B_NR_ITEMS(tb->L[0]);
- unsigned body_shift_bytes = 0;
-
- if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
- /* part of new item falls into L[0] */
- int new_item_len, shift;
-
- ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
-
- /* Calculate item length to insert to S[0] */
- new_item_len = ih_item_len(ih) - tb->lbytes;
-
- /* Calculate and check item length to insert to L[0] */
- put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12080: there is nothing to insert into L[0]: "
- "ih_item_len=%d", ih_item_len(ih));
-
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
- min_t(int, tb->zeroes_num, ih_item_len(ih)));
-
- /*
- * Calculate key component, item length and body to
- * insert into S[0]
- */
- shift = 0;
- if (is_indirect_le_ih(ih))
- shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
-
- add_le_ih_k_offset(ih, tb->lbytes << shift);
-
- put_ih_item_len(ih, new_item_len);
- if (tb->lbytes > tb->zeroes_num) {
- body_shift_bytes = tb->lbytes - tb->zeroes_num;
- tb->zeroes_num = 0;
- } else
- tb->zeroes_num -= tb->lbytes;
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12085: there is nothing to insert into S[0]: "
- "ih_item_len=%d", ih_item_len(ih));
- } else {
- /* the new item falls wholly into L[0] */
- /* Shift lnum[0]-1 items to L[0] */
- ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
-
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
- tb->zeroes_num);
- tb->insert_size[0] = 0;
- tb->zeroes_num = 0;
- }
- return body_shift_bytes;
-}
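The add_le_ih_k_offset(ih, tb->lbytes << shift) step above converts shifted item-body bytes into a file-offset advance: for an indirect item, each UNFM_P_SIZE-byte pointer in the body maps to one block of file data. A worked example under assumed geometry (4096-byte blocks and 4-byte unformatted pointers, so UNFM_P_SHIFT == 2):

    #include <stdio.h>

    int main(void)
    {
            int blocksize_bits = 12;                    /* log2(4096) */
            int unfm_p_shift = 2;                       /* log2(4-byte pointer) */
            int shift = blocksize_bits - unfm_p_shift;  /* 10 */
            int lbytes = 8;                             /* 8 body bytes == 2 block pointers */

            /* 8 << 10 == 8192 bytes == exactly 2 blocks of file data */
            printf("key offset advances by %d bytes\n", lbytes << shift);
            return 0;
    }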
-
-static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- int n = B_NR_ITEMS(tb->L[0]);
- struct buffer_info bi;
-
- RFALSE(tb->zeroes_num,
- "PAP-12090: invalid parameter in case of a directory");
-
- /* directory item */
- if (tb->lbytes > tb->pos_in_item) {
- /* new directory entry falls into L[0] */
- struct item_head *pasted;
- int ret, l_pos_in_item = tb->pos_in_item;
-
- /*
- * Shift lnum[0] - 1 items in whole.
- * Shift lbytes - 1 entries from given directory item
- */
- ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
- if (ret && !tb->item_pos) {
- pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
- l_pos_in_item += ih_entry_count(pasted) -
- (tb->lbytes - 1);
- }
-
- /* Append given directory entry to directory item */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
- l_pos_in_item, tb->insert_size[0],
- body, tb->zeroes_num);
-
- /*
- * the previous call prepared space for pasting the new
- * entry; the following call pastes the entry itself
- */
-
- /*
- * when directory items were merged, pos_in_item has
- * been adjusted as well
- */
-
- /* paste new directory entry. 1 is entry number */
- leaf_paste_entries(&bi, n + tb->item_pos - ret,
- l_pos_in_item, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
- tb->insert_size[0] = 0;
- } else {
- /* the new directory entry doesn't fall into L[0] */
- /*
- * Shift lnum[0]-1 items in whole. Shift lbytes
- * directory entries from directory item number lnum[0]
- */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- }
-
- /* Calculate new position to append in item body */
- tb->pos_in_item -= tb->lbytes;
-}
-
-static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tb->L[0]);
- struct buffer_info bi;
- int body_shift_bytes = 0;
-
- if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
- balance_leaf_paste_left_shift_dirent(tb, ih, body);
- return 0;
- }
-
- RFALSE(tb->lbytes <= 0,
- "PAP-12095: there is nothing to shift to L[0]. "
- "lbytes=%d", tb->lbytes);
- RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
- "PAP-12100: incorrect position to paste: "
- "item_len=%d, pos_in_item=%d",
- ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
-
- /* the appended item will be entirely in L[0] */
- if (tb->lbytes >= tb->pos_in_item) {
- struct item_head *tbS0_pos_ih, *tbL0_ih;
- struct item_head *tbS0_0_ih;
- struct reiserfs_key *left_delim_key;
- int ret, l_n, version, temp_l;
-
- tbS0_pos_ih = item_head(tbS0, tb->item_pos);
- tbS0_0_ih = item_head(tbS0, 0);
-
- /*
- * this many bytes must be appended
- * to the last item of L[h]
- */
- l_n = tb->lbytes - tb->pos_in_item;
-
- /* Calculate new insert_size[0] */
- tb->insert_size[0] -= l_n;
-
- RFALSE(tb->insert_size[0] <= 0,
- "PAP-12105: there is nothing to paste into "
- "L[0]. insert_size=%d", tb->insert_size[0]);
-
- ret = leaf_shift_left(tb, tb->lnum[0],
- ih_item_len(tbS0_pos_ih));
-
- tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
-
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
- ih_item_len(tbL0_ih), l_n, body,
- min_t(int, l_n, tb->zeroes_num));
-
- /*
- * 0-th item in S0 can be only of DIRECT type
- * when l_n != 0
- */
- temp_l = l_n;
-
- RFALSE(ih_item_len(tbS0_0_ih),
- "PAP-12106: item length must be 0");
- RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
- leaf_key(tb->L[0], n + tb->item_pos - ret)),
- "PAP-12107: items must be of the same file");
-
- if (is_indirect_le_ih(tbL0_ih)) {
- int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
- temp_l = l_n << shift;
- }
- /* update key of first item in S0 */
- version = ih_version(tbS0_0_ih);
- add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
-
- /* update left delimiting key */
- left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
- add_le_key_k_offset(version, left_delim_key, temp_l);
-
- /*
- * Calculate new body, position in item and
- * insert_size[0]
- */
- if (l_n > tb->zeroes_num) {
- body_shift_bytes = l_n - tb->zeroes_num;
- tb->zeroes_num = 0;
- } else
- tb->zeroes_num -= l_n;
- tb->pos_in_item = 0;
-
- RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
- leaf_key(tb->L[0],
- B_NR_ITEMS(tb->L[0]) - 1)) ||
- !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
- !op_is_left_mergeable(left_delim_key, tbS0->b_size),
- "PAP-12120: item must be merge-able with left "
- "neighboring item");
- } else {
- /* only part of the appended item will be in L[0] */
-
- /* Calculate position in item for append in S[0] */
- tb->pos_in_item -= tb->lbytes;
-
- RFALSE(tb->pos_in_item <= 0,
- "PAP-12125: no place for paste. pos_in_item=%d",
- tb->pos_in_item);
-
- /*
- * Shift lnum[0] - 1 items in whole, plus
- * lbytes bytes of the next item
- */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- }
- return body_shift_bytes;
-}
-
-
-/* the appended item will be entirely in L[0] */
-static void balance_leaf_paste_left_whole(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tb->L[0]);
- struct buffer_info bi;
- struct item_head *pasted;
- int ret;
-
- /* if we paste into the first item of S[0] and it is left mergeable */
- if (!tb->item_pos &&
- op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
- /*
- * then increment pos_in_item by the size of the
- * last item in L[0]
- */
- pasted = item_head(tb->L[0], n - 1);
- if (is_direntry_le_ih(pasted))
- tb->pos_in_item += ih_entry_count(pasted);
- else
- tb->pos_in_item += ih_item_len(pasted);
- }
-
- /*
- * Shift lnum[0] items to L[0]; when lbytes != -1 the
- * boundary item is shifted partially (lbytes bytes)
- */
- ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
-
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
- tb->insert_size[0], body, tb->zeroes_num);
-
- /* if appended item is directory, paste entry */
- pasted = item_head(tb->L[0], n + tb->item_pos - ret);
- if (is_direntry_le_ih(pasted))
- leaf_paste_entries(&bi, n + tb->item_pos - ret,
- tb->pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- /*
- * if the appended item is an indirect item, clear its
- * ih_free_space field
- */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
-
- tb->insert_size[0] = 0;
- tb->zeroes_num = 0;
-}
-
-static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- /* we must shift the part of the appended item */
- if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
- return balance_leaf_paste_left_shift(tb, ih, body);
- else
- balance_leaf_paste_left_whole(tb, ih, body);
- return 0;
-}
-
-/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-static unsigned int balance_leaf_left(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body, int flag)
-{
- if (tb->lnum[0] <= 0)
- return 0;
-
- /* the new item, or part of it, falls into L[0]; shift it too */
- if (tb->item_pos < tb->lnum[0]) {
- BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
- if (flag == M_INSERT)
- return balance_leaf_insert_left(tb, ih, body);
- else /* M_PASTE */
- return balance_leaf_paste_left(tb, ih, body);
- } else
- /* new item doesn't fall into L[0] */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- return 0;
-}
-
-static void balance_leaf_insert_right(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
- struct buffer_info bi;
-
- /* new item or part of it doesn't fall into R[0] */
- if (n - tb->rnum[0] >= tb->item_pos) {
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- return;
- }
-
- /* new item or its part falls to R[0] */
-
- /* part of new item falls into R[0] */
- if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
- loff_t old_key_comp, old_len, r_zeroes_number;
- const char *r_body;
- int shift;
- loff_t offset;
-
- leaf_shift_right(tb, tb->rnum[0] - 1, -1);
-
- /* Remember key component and item length */
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /*
- * Calculate key component and item length to insert
- * into R[0]
- */
- shift = 0;
- if (is_indirect_le_ih(ih))
- shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
- offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
- set_le_ih_k_offset(ih, offset);
- put_ih_item_len(ih, tb->rbytes);
-
- /* Insert part of the item into R[0] */
- buffer_info_init_right(tb, &bi);
- if ((old_len - tb->rbytes) > tb->zeroes_num) {
- r_zeroes_number = 0;
- r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
- } else {
- r_body = body;
- r_zeroes_number = tb->zeroes_num -
- (old_len - tb->rbytes);
- tb->zeroes_num -= r_zeroes_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
- /* Replace right delimiting key by first key in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
- /*
- * Calculate key component and item length to
- * insert into S[0]
- */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih, old_len - tb->rbytes);
-
- tb->insert_size[0] -= tb->rbytes;
-
- } else {
- /* whole new item falls into R[0] */
-
- /* Shift rnum[0]-1 items to R[0] */
- leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
-
- /* Insert new item into R[0] */
- buffer_info_init_right(tb, &bi);
- leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
- ih, body, tb->zeroes_num);
-
- if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
- tb->zeroes_num = tb->insert_size[0] = 0;
- }
-}
-
-static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct buffer_info bi;
- int entry_count;
-
- RFALSE(tb->zeroes_num,
- "PAP-12145: invalid parameter in case of a directory");
- entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
-
- /* new directory entry falls into R[0] */
- if (entry_count - tb->rbytes < tb->pos_in_item) {
- int paste_entry_position;
-
- RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
- "PAP-12150: no enough of entries to shift to R[0]: "
- "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
-
- /*
- * Shift rnum[0]-1 items in whole.
- * Shift rbytes-1 directory entries from directory
- * item number rnum[0]
- */
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
-
- /* Paste given directory entry to directory item */
- paste_entry_position = tb->pos_in_item - entry_count +
- tb->rbytes - 1;
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer(&bi, 0, paste_entry_position,
- tb->insert_size[0], body, tb->zeroes_num);
-
- /* paste entry */
- leaf_paste_entries(&bi, 0, paste_entry_position, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- /* change delimiting keys */
- if (paste_entry_position == 0)
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
- tb->insert_size[0] = 0;
- tb->pos_in_item++;
- } else {
- /* new directory entry doesn't fall into R[0] */
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
-}
-
-static void balance_leaf_paste_right_shift(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n_shift, n_rem, r_zeroes_number, version;
- unsigned long temp_rem;
- const char *r_body;
- struct buffer_info bi;
-
- /* we append to directory item */
- if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
- balance_leaf_paste_right_shift_dirent(tb, ih, body);
- return;
- }
-
- /* regular object */
-
- /*
- * Calculate number of bytes which must be shifted
- * from appended item
- */
- n_shift = tb->rbytes - tb->insert_size[0];
- if (n_shift < 0)
- n_shift = 0;
-
- RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
- "PAP-12155: invalid position to paste. ih_item_len=%d, "
- "pos_in_item=%d", tb->pos_in_item,
- ih_item_len(item_head(tbS0, tb->item_pos)));
-
- leaf_shift_right(tb, tb->rnum[0], n_shift);
-
- /*
- * Calculate number of bytes which must remain in body
- * after appending to R[0]
- */
- n_rem = tb->insert_size[0] - tb->rbytes;
- if (n_rem < 0)
- n_rem = 0;
-
- temp_rem = n_rem;
-
- version = ih_version(item_head(tb->R[0], 0));
-
- if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
- int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
- temp_rem = n_rem << shift;
- }
-
- add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
- add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
- temp_rem);
-
- do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
-
- /* Append part of body into R[0] */
- buffer_info_init_right(tb, &bi);
- if (n_rem > tb->zeroes_num) {
- r_zeroes_number = 0;
- r_body = body + n_rem - tb->zeroes_num;
- } else {
- r_body = body;
- r_zeroes_number = tb->zeroes_num - n_rem;
- tb->zeroes_num -= r_zeroes_number;
- }
-
- leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
- r_body, r_zeroes_number);
-
- if (is_indirect_le_ih(item_head(tb->R[0], 0)))
- set_ih_free_space(item_head(tb->R[0], 0), 0);
-
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- tb->pos_in_item++;
-}
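The n_shift/n_rem arithmetic above splits the appended bytes between R[0] and S[0]: n_shift is the part of the existing item body that must move right, and n_rem is the part of the new bytes that stays behind. A minimal numeric sketch (hypothetical values, not taken from a real balance):

    #include <stdio.h>

    int main(void)
    {
            int rbytes = 30;        /* bytes of the boundary item that go to R[0] */
            int insert_size = 100;  /* bytes still to be pasted */

            /* part of the *existing* item body that must move to R[0] */
            int n_shift = rbytes - insert_size;
            if (n_shift < 0)
                    n_shift = 0;    /* 0 here: only new bytes reach R[0] */

            /* part of the *new* bytes that stays behind in S[0] */
            int n_rem = insert_size - rbytes;
            if (n_rem < 0)
                    n_rem = 0;      /* 70 here: 70 bytes remain for S[0] */

            printf("shift %d old bytes, paste %d new bytes into R[0], %d remain\n",
                   n_shift, insert_size - n_rem, n_rem);
            return 0;
    }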
-
-static void balance_leaf_paste_right_whole(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
- struct item_head *pasted;
- struct buffer_info bi;
-
- buffer_info_init_right(tb, &bi);
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
- /* append item in R[0] */
- if (tb->pos_in_item >= 0) {
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
- tb->pos_in_item, tb->insert_size[0], body,
- tb->zeroes_num);
- }
-
- /* paste new entry, if item is directory item */
- pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
- if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
- leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
- tb->pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- if (!tb->pos_in_item) {
- RFALSE(tb->item_pos - n + tb->rnum[0],
- "PAP-12165: directory item must be first "
- "item of node when pasting is in 0th position");
-
- /* update delimiting keys */
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
- }
- }
-
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- tb->zeroes_num = tb->insert_size[0] = 0;
-}
-
-static void balance_leaf_paste_right(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
-
- /* new item doesn't fall into R[0] */
- if (n - tb->rnum[0] > tb->item_pos) {
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- return;
- }
-
- /* pasted item or part of it falls to R[0] */
-
- if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
- /* we must shift the part of the appended item */
- balance_leaf_paste_right_shift(tb, ih, body);
- else
- /* pasted item in whole falls into R[0] */
- balance_leaf_paste_right_whole(tb, ih, body);
-}
-
-/* shift rnum[0] items from S[0] to the right neighbor R[0] */
-static void balance_leaf_right(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body, int flag)
-{
- if (tb->rnum[0] <= 0)
- return;
-
- BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
- if (flag == M_INSERT)
- balance_leaf_insert_right(tb, ih, body);
- else /* M_PASTE */
- balance_leaf_paste_right(tb, ih, body);
-}
-
-static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int i)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
- struct buffer_info bi;
- int shift;
-
- /* the new item, or part of it, doesn't fall into S_new[i] */
- if (n - tb->snum[i] >= tb->item_pos) {
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- tb->snum[i], tb->sbytes[i], tb->S_new[i]);
- return;
- }
-
- /* the new item, or its part, falls into the first new node S_new[i] */
-
- /* part of new item falls into S_new[i] */
- if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
- int old_key_comp, old_len, r_zeroes_number;
- const char *r_body;
-
- /* Move snum[i]-1 items from S[0] to S_new[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
- tb->S_new[i]);
-
- /* Remember key component and item length */
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /*
- * Calculate key component and item length to insert
- * into S_new[i]
- */
- shift = 0;
- if (is_indirect_le_ih(ih))
- shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
- set_le_ih_k_offset(ih,
- le_ih_k_offset(ih) +
- ((old_len - tb->sbytes[i]) << shift));
-
- put_ih_item_len(ih, tb->sbytes[i]);
-
- /* Insert part of the item into S_new[i] before 0-th item */
- buffer_info_init_bh(tb, &bi, tb->S_new[i]);
-
- if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
- r_zeroes_number = 0;
- r_body = body + (old_len - tb->sbytes[i]) -
- tb->zeroes_num;
- } else {
- r_body = body;
- r_zeroes_number = tb->zeroes_num - (old_len -
- tb->sbytes[i]);
- tb->zeroes_num -= r_zeroes_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
-
- /*
- * Calculate key component and item length to
- * insert into S[0]
- */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih, old_len - tb->sbytes[i]);
- tb->insert_size[0] -= tb->sbytes[i];
- } else {
- /* whole new item falls into S_new[i] */
-
- /*
- * Shift snum[i] - 1 items to S_new[i]
- * (plus sbytes[i] bytes of the split item)
- */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
-
- /* Insert new item into S_new[i] */
- buffer_info_init_bh(tb, &bi, tb->S_new[i]);
- leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
- ih, body, tb->zeroes_num);
-
- tb->zeroes_num = tb->insert_size[0] = 0;
- }
-}
-
-/* we append to directory item */
-static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int i)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
- int entry_count = ih_entry_count(aux_ih);
- struct buffer_info bi;
-
- if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
- tb->pos_in_item <= entry_count) {
- /* new directory entry falls into S_new[i] */
-
- RFALSE(!tb->insert_size[0],
- "PAP-12215: insert_size is already 0");
- RFALSE(tb->sbytes[i] - 1 >= entry_count,
- "PAP-12220: there are no so much entries (%d), only %d",
- tb->sbytes[i] - 1, entry_count);
-
- /*
- * Shift snum[i]-1 items in whole.
- * Shift sbytes[i] - 1 directory entries
- * from directory item number snum[i]
- */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
- tb->sbytes[i] - 1, tb->S_new[i]);
-
- /*
- * Paste given directory entry to
- * directory item
- */
- buffer_info_init_bh(tb, &bi, tb->S_new[i]);
- leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
- tb->sbytes[i] - 1, tb->insert_size[0],
- body, tb->zeroes_num);
-
- /* paste new directory entry */
- leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
- tb->sbytes[i] - 1, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- tb->insert_size[0] = 0;
- tb->pos_in_item++;
- } else {
- /* new directory entry doesn't fall into S_new[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
- tb->sbytes[i], tb->S_new[i]);
- }
-}
-
-static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int i)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
- int n_shift, n_rem, r_zeroes_number, shift;
- const char *r_body;
- struct item_head *tmp;
- struct buffer_info bi;
-
- RFALSE(ih, "PAP-12210: ih must be 0");
-
- if (is_direntry_le_ih(aux_ih)) {
- balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
- insert_ptr, i);
- return;
- }
-
- /* regular object */
-
- RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
- tb->insert_size[0] <= 0,
- "PAP-12225: item too short or insert_size <= 0");
-
- /*
- * Calculate number of bytes which must be shifted from appended item
- */
- n_shift = tb->sbytes[i] - tb->insert_size[0];
- if (n_shift < 0)
- n_shift = 0;
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
- tb->S_new[i]);
-
- /*
- * Calculate number of bytes which must remain in body after
- * append to S_new[i]
- */
- n_rem = tb->insert_size[0] - tb->sbytes[i];
- if (n_rem < 0)
- n_rem = 0;
-
- /* Append part of body into S_new[i] */
- buffer_info_init_bh(tb, &bi, tb->S_new[i]);
- if (n_rem > tb->zeroes_num) {
- r_zeroes_number = 0;
- r_body = body + n_rem - tb->zeroes_num;
- } else {
- r_body = body;
- r_zeroes_number = tb->zeroes_num - n_rem;
- tb->zeroes_num -= r_zeroes_number;
- }
-
- leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
- r_body, r_zeroes_number);
-
- tmp = item_head(tb->S_new[i], 0);
- shift = 0;
- if (is_indirect_le_ih(tmp)) {
- set_ih_free_space(tmp, 0);
- shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
- }
- add_le_ih_k_offset(tmp, n_rem << shift);
-
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- tb->pos_in_item++;
-}
-
-static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int i)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
- int leaf_mi;
- struct item_head *pasted;
- struct buffer_info bi;
-
-#ifdef CONFIG_REISERFS_CHECK
- struct item_head *ih_check = item_head(tbS0, tb->item_pos);
-
- if (!is_direntry_le_ih(ih_check) &&
- (tb->pos_in_item != ih_item_len(ih_check) ||
- tb->insert_size[0] <= 0))
- reiserfs_panic(tb->tb_sb,
- "PAP-12235",
- "pos_in_item must be equal to ih_item_len");
-#endif
-
- leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
- tb->sbytes[i], tb->S_new[i]);
-
- RFALSE(leaf_mi,
- "PAP-12240: unexpected value returned by leaf_move_items (%d)",
- leaf_mi);
-
- /* paste into item */
- buffer_info_init_bh(tb, &bi, tb->S_new[i]);
- leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
- tb->pos_in_item, tb->insert_size[0],
- body, tb->zeroes_num);
-
- pasted = item_head(tb->S_new[i], tb->item_pos - n +
- tb->snum[i]);
- if (is_direntry_le_ih(pasted))
- leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
- tb->pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- /* if we paste to indirect item update ih_free_space */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
-
- tb->zeroes_num = tb->insert_size[0] = 0;
-}
-
-static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int i)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int n = B_NR_ITEMS(tbS0);
-
- /* pasted item doesn't fall into S_new[i] */
- if (n - tb->snum[i] > tb->item_pos) {
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- tb->snum[i], tb->sbytes[i], tb->S_new[i]);
- return;
- }
-
- /* the pasted item, or part of it, falls into S_new[i] */
-
- if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
- /* we must shift part of the appended item */
- balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
- insert_ptr, i);
- else
- /* item falls wholly into S_new[i] */
- balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
- insert_ptr, i);
-}
-
-/* Fill new nodes that appear in place of S[0] */
-static void balance_leaf_new_nodes(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr,
- int flag)
-{
- int i;
- for (i = tb->blknum[0] - 2; i >= 0; i--) {
- BUG_ON(flag != M_INSERT && flag != M_PASTE);
-
- RFALSE(!tb->snum[i],
- "PAP-12200: snum[%d] == %d. Must be > 0", i,
- tb->snum[i]);
-
- /* here we shift from S to S_new nodes */
-
- tb->S_new[i] = get_FEB(tb);
-
- /* initialize block type and tree level */
- set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
-
- if (flag == M_INSERT)
- balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
- insert_ptr, i);
- else /* M_PASTE */
- balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
- insert_ptr, i);
-
- memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
- insert_ptr[i] = tb->S_new[i];
-
- RFALSE(!buffer_journaled(tb->S_new[i])
- || buffer_journal_dirty(tb->S_new[i])
- || buffer_dirty(tb->S_new[i]),
- "PAP-12247: S_new[%d] : (%b)",
- i, tb->S_new[i]);
- }
-}
-
-static void balance_leaf_finish_node_insert(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct buffer_info bi;
- buffer_info_init_tbS0(tb, &bi);
- leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
-
- /* If we inserted the first key, change the delimiting key */
- if (tb->item_pos == 0) {
- if (tb->CFL[0]) /* can be 0 in reiserfsck */
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-
- }
-}
-
-static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct item_head *pasted = item_head(tbS0, tb->item_pos);
- struct buffer_info bi;
-
- if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
- RFALSE(!tb->insert_size[0],
- "PAP-12260: insert_size is 0 already");
-
- /* prepare space */
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
- tb->insert_size[0], body, tb->zeroes_num);
-
- /* paste entry */
- leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- if (!tb->item_pos && !tb->pos_in_item) {
- RFALSE(!tb->CFL[0] || !tb->L[0],
- "PAP-12270: CFL[0]/L[0] must be specified");
- if (tb->CFL[0])
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- tbS0, 0);
- }
-
- tb->insert_size[0] = 0;
- }
-}
-
-static void balance_leaf_finish_node_paste(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- struct buffer_info bi;
- struct item_head *pasted = item_head(tbS0, tb->item_pos);
-
- /* for a directory, the new entry may already have been pasted */
- if (is_direntry_le_ih(pasted)) {
- balance_leaf_finish_node_paste_dirent(tb, ih, body);
- return;
- }
-
- /* regular object */
-
- if (tb->pos_in_item == ih_item_len(pasted)) {
- RFALSE(tb->insert_size[0] <= 0,
- "PAP-12275: insert size must not be %d",
- tb->insert_size[0]);
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi, tb->item_pos,
- tb->pos_in_item, tb->insert_size[0], body,
- tb->zeroes_num);
-
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
-
- tb->insert_size[0] = 0;
- }
-#ifdef CONFIG_REISERFS_CHECK
- else if (tb->insert_size[0]) {
- print_cur_tb("12285");
- reiserfs_panic(tb->tb_sb, "PAP-12285",
- "insert_size must be 0 (%d)", tb->insert_size[0]);
- }
-#endif
-}
-
-/*
- * if the affected item was not wholly shifted away, perform
- * all necessary operations on the part (or whole) of the
- * affected item that remains in S
- */
-static void balance_leaf_finish_node(struct tree_balance *tb,
- struct item_head * const ih,
- const char * const body, int flag)
-{
- /* if we must insert or append into buffer S[0] */
- if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
- if (flag == M_INSERT)
- balance_leaf_finish_node_insert(tb, ih, body);
- else /* M_PASTE */
- balance_leaf_finish_node_paste(tb, ih, body);
- }
-}
-
-/**
- * balance_leaf - reiserfs tree balancing algorithm
- * @tb: tree balance state
- * @ih: item header of inserted item (little endian)
- * @body: body of inserted item or bytes to paste
- * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
- * passed back:
- * @insert_key: key to insert new nodes
- * @insert_ptr: array of nodes to insert at the next level
- *
- * In our processing of one level we sometimes determine what must be
- * inserted into the next higher level. This insertion consists of a
- * key or two keys and their corresponding pointers.
- */
-static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
- const char *body, int flag,
- struct item_head *insert_key,
- struct buffer_head **insert_ptr)
-{
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
- PROC_INFO_INC(tb->tb_sb, balance_at[0]);
-
- /* Balance the delete/cut case: insert_size[0] < 0 */
- if (tb->insert_size[0] < 0)
- return balance_leaf_when_delete(tb, flag);
-
- tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
- tb->pos_in_item = tb->tb_path->pos_in_item,
- tb->zeroes_num = 0;
- if (flag == M_INSERT && !body)
- tb->zeroes_num = ih_item_len(ih);
-
- /*
- * for indirect item pos_in_item is measured in unformatted node
- * pointers. Recalculate to bytes
- */
- if (flag != M_INSERT
- && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
- tb->pos_in_item *= UNFM_P_SIZE;
-
- body += balance_leaf_left(tb, ih, body, flag);
-
- /* tb->lnum[0] > 0 */
- /* Calculate new item position */
- tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
-
- balance_leaf_right(tb, ih, body, flag);
-
- /* tb->rnum[0] > 0 */
- RFALSE(tb->blknum[0] > 3,
- "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
- RFALSE(tb->blknum[0] < 0,
- "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
-
- /*
- * if while adding to a node we discover that it is possible to split
- * it in two, and merge the left part into the left neighbor and the
- * right part into the right neighbor, eliminating the node
- */
- if (tb->blknum[0] == 0) { /* node S[0] is empty now */
-
- RFALSE(!tb->lnum[0] || !tb->rnum[0],
- "PAP-12190: lnum and rnum must not be zero");
- /*
- * if the insertion was done before the 0th position in R[0],
- * the right delimiting key of tb->L[0] and the left delimiting
- * key of R[0] are not set correctly
- */
- if (tb->CFL[0]) {
- if (!tb->CFR[0])
- reiserfs_panic(tb->tb_sb, "vs-12195",
- "CFR not initialized");
- copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
- internal_key(tb->CFR[0], tb->rkey[0]));
- do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
- }
-
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
- }
-
- balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
-
- balance_leaf_finish_node(tb, ih, body, flag);
-
-#ifdef CONFIG_REISERFS_CHECK
- if (flag == M_PASTE && tb->insert_size[0]) {
- print_cur_tb("12290");
- reiserfs_panic(tb->tb_sb,
- "PAP-12290", "insert_size is still not 0 (%d)",
- tb->insert_size[0]);
- }
-#endif
-
- /* Leaf level of the tree is balanced (end of balance_leaf) */
- return 0;
-}
-
-/* Make empty node */
-void make_empty_node(struct buffer_info *bi)
-{
- struct block_head *blkh;
-
- RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
-
- blkh = B_BLK_HEAD(bi->bi_bh);
- set_blkh_nr_item(blkh, 0);
- set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
-
- if (bi->bi_parent)
- B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
-}
-
-/* Get first empty buffer */
-struct buffer_head *get_FEB(struct tree_balance *tb)
-{
- int i;
- struct buffer_info bi;
-
- for (i = 0; i < MAX_FEB_SIZE; i++)
- if (tb->FEB[i] != NULL)
- break;
-
- if (i == MAX_FEB_SIZE)
- reiserfs_panic(tb->tb_sb, "vs-12300", "FEB list is empty");
-
- buffer_info_init_bh(tb, &bi, tb->FEB[i]);
- make_empty_node(&bi);
- set_buffer_uptodate(tb->FEB[i]);
- tb->used[i] = tb->FEB[i];
- tb->FEB[i] = NULL;
-
- return tb->used[i];
-}
-
-/* This is now used because reiserfs_free_block has to be able to schedule. */
-static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
-{
- int i;
-
- if (buffer_dirty(bh))
- reiserfs_warning(tb->tb_sb, "reiserfs-12320",
- "called with dirty buffer");
- for (i = 0; i < ARRAY_SIZE(tb->thrown); i++)
- if (!tb->thrown[i]) {
- tb->thrown[i] = bh;
- get_bh(bh); /* free_thrown puts this */
- return;
- }
- reiserfs_warning(tb->tb_sb, "reiserfs-12321",
- "too many thrown buffers");
-}
-
-static void free_thrown(struct tree_balance *tb)
-{
- int i;
- b_blocknr_t blocknr;
- for (i = 0; i < ARRAY_SIZE(tb->thrown); i++) {
- if (tb->thrown[i]) {
- blocknr = tb->thrown[i]->b_blocknr;
- if (buffer_dirty(tb->thrown[i]))
- reiserfs_warning(tb->tb_sb, "reiserfs-12322",
- "called with dirty buffer %d",
- blocknr);
- brelse(tb->thrown[i]); /* incremented in store_thrown */
- reiserfs_free_block(tb->transaction_handle, NULL,
- blocknr, 0);
- }
- }
-}
-
-void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
-{
- struct block_head *blkh;
- blkh = B_BLK_HEAD(bh);
- set_blkh_level(blkh, FREE_LEVEL);
- set_blkh_nr_item(blkh, 0);
-
- clear_buffer_dirty(bh);
- store_thrown(tb, bh);
-}
-
-/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
-void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
- struct buffer_head *src, int n_src)
-{
- RFALSE(dest == NULL || src == NULL,
- "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
- src, dest);
- RFALSE(!B_IS_KEYS_LEVEL(dest),
- "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
- dest);
- RFALSE(n_dest < 0 || n_src < 0,
- "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
- RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
- "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
- n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
-
- if (B_IS_ITEMS_LEVEL(src))
- /* source buffer contains leaf node */
- memcpy(internal_key(dest, n_dest), item_head(src, n_src),
- KEY_SIZE);
- else
- memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
- KEY_SIZE);
-
- do_balance_mark_internal_dirty(tb, dest, 0);
-}
-
-int get_left_neighbor_position(struct tree_balance *tb, int h)
-{
- int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
- RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FL[h] == NULL,
- "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
- h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
-
- if (Sh_position == 0)
- return B_NR_ITEMS(tb->FL[h]);
- else
- return Sh_position - 1;
-}
-
-int get_right_neighbor_position(struct tree_balance *tb, int h)
-{
- int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
- RFALSE(PATH_H_PPARENT(tb->tb_path, h) == NULL || tb->FR[h] == NULL,
- "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
- h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
-
- if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
- return 0;
- else
- return Sh_position + 1;
-}
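The two helpers above encode one simple rule: a node's neighbor occupies the adjacent child slot in the shared parent, except at the boundary, where the neighbor is the last (or first) child under the common ancestor FL[h]/FR[h]. A compact restatement, sketch only, with hypothetical inputs:

    /* pos: the node's position in its own parent;
     * nr_left: number of items in FL[h];
     * nr_parent_items: number of items in the node's parent. */
    int left_neighbor_pos(int pos, int nr_left)
    {
            return pos == 0 ? nr_left : pos - 1;
    }

    int right_neighbor_pos(int pos, int nr_parent_items)
    {
            return pos == nr_parent_items ? 0 : pos + 1;
    }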
-
-#ifdef CONFIG_REISERFS_CHECK
-
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-static void check_internal_node(struct super_block *s, struct buffer_head *bh,
- char *mes)
-{
- struct disk_child *dc;
- int i;
-
- RFALSE(!bh, "PAP-12336: bh == 0");
-
- if (!bh || !B_IS_IN_TREE(bh))
- return;
-
- RFALSE(!buffer_dirty(bh) &&
- !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
- "PAP-12337: buffer (%b) must be dirty", bh);
- dc = B_N_CHILD(bh, 0);
-
- for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
- if (!is_reusable(s, dc_block_number(dc), 1)) {
- print_cur_tb(mes);
- reiserfs_panic(s, "PAP-12338",
- "invalid child pointer %y in %b",
- dc, bh);
- }
- }
-}
-
-static int locked_or_not_in_tree(struct tree_balance *tb,
- struct buffer_head *bh, char *which)
-{
- if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
- !B_IS_IN_TREE(bh)) {
- reiserfs_warning(tb->tb_sb, "vs-12339", "%s (%b)", which, bh);
- return 1;
- }
- return 0;
-}
-
-static int check_before_balancing(struct tree_balance *tb)
-{
- int retval = 0;
-
- if (REISERFS_SB(tb->tb_sb)->cur_tb) {
- reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
- "occurred based on cur_tb not being null at "
- "this point in code. do_balance cannot properly "
- "handle concurrent tree accesses on a same "
- "mount point.");
- }
-
- /*
- * double check that buffers that we will modify are unlocked.
- * (fix_nodes should already have prepped all of these for us).
- */
- if (tb->lnum[0]) {
- retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
- retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
- retval |= locked_or_not_in_tree(tb, tb->CFL[0], "CFL[0]");
- check_leaf(tb->L[0]);
- }
- if (tb->rnum[0]) {
- retval |= locked_or_not_in_tree(tb, tb->R[0], "R[0]");
- retval |= locked_or_not_in_tree(tb, tb->FR[0], "FR[0]");
- retval |= locked_or_not_in_tree(tb, tb->CFR[0], "CFR[0]");
- check_leaf(tb->R[0]);
- }
- retval |= locked_or_not_in_tree(tb, PATH_PLAST_BUFFER(tb->tb_path),
- "S[0]");
- check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-
- return retval;
-}
-
-static void check_after_balance_leaf(struct tree_balance *tb)
-{
- if (tb->lnum[0]) {
- if (B_FREE_SPACE(tb->L[0]) !=
- MAX_CHILD_SIZE(tb->L[0]) -
- dc_size(B_N_CHILD
- (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
- print_cur_tb("12221");
- reiserfs_panic(tb->tb_sb, "PAP-12355",
- "shift to left was incorrect");
- }
- }
- if (tb->rnum[0]) {
- if (B_FREE_SPACE(tb->R[0]) !=
- MAX_CHILD_SIZE(tb->R[0]) -
- dc_size(B_N_CHILD
- (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
- print_cur_tb("12222");
- reiserfs_panic(tb->tb_sb, "PAP-12360",
- "shift to right was incorrect");
- }
- }
- if (PATH_H_PBUFFER(tb->tb_path, 1) &&
- (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
- (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
- dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
- PATH_H_POSITION(tb->tb_path, 1)))))) {
- int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
- int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
- dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
- PATH_H_POSITION(tb->tb_path,
- 1))));
- print_cur_tb("12223");
- reiserfs_warning(tb->tb_sb, "reiserfs-12363",
- "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
- "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
- left,
- MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
- PATH_H_PBUFFER(tb->tb_path, 1),
- PATH_H_POSITION(tb->tb_path, 1),
- dc_size(B_N_CHILD
- (PATH_H_PBUFFER(tb->tb_path, 1),
- PATH_H_POSITION(tb->tb_path, 1))),
- right);
- reiserfs_panic(tb->tb_sb, "PAP-12365", "S is incorrect");
- }
-}
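check_after_balance_leaf verifies the same invariant in three places: a leaf's recorded free space must equal the maximum child size minus the size recorded for it in its parent's disk_child. Expressed once as a sketch (names are illustrative):

    /* Invariant checked above:
     *   B_FREE_SPACE(leaf) == MAX_CHILD_SIZE(leaf) - dc_size(parent's child slot)
     */
    int leaf_free_space_ok(int free_space, int max_child_size, int dc_size)
    {
            return free_space == max_child_size - dc_size;
    }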
-
-static void check_leaf_level(struct tree_balance *tb)
-{
- check_leaf(tb->L[0]);
- check_leaf(tb->R[0]);
- check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
-}
-
-static void check_internal_levels(struct tree_balance *tb)
-{
- int h;
-
- /* check all internal nodes */
- for (h = 1; tb->insert_size[h]; h++) {
- check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
- "BAD BUFFER ON PATH");
- if (tb->lnum[h])
- check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
- if (tb->rnum[h])
- check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
- }
-}
-
-#endif
-
-/*
- * Now we have all of the buffers that must be used in balancing of
- * the tree. We rely on the assumption that schedule() will not occur
- * while do_balance works. (Only interrupt handlers are acceptable.)
- * We balance the tree according to the analysis made before this,
- * using buffers already obtained. For SMP support it will someday be
- * necessary to add ordered locking of tb.
- */
-
-/*
- * Some interesting rules of balancing:
- * we delete a maximum of two nodes per level per balancing: we never
- * delete R, when we delete two of three nodes L, S, R then we move
- * them into R.
- *
- * we only delete L if we are deleting two nodes, if we delete only
- * one node we delete S
- *
- * if we shift leaves then we shift as much as we can: this is a
- * deliberate policy of extremism in node packing which results in
- * higher average utilization after repeated random balance operations
- * at the cost of more memory copies and more balancing as a result of
- * small insertions to full nodes.
- *
- * if we shift internal nodes we try to evenly balance the node
- * utilization, with consequent less balancing at the cost of lower
- * utilization.
- *
- * one could argue that the policy for directories in leaves should be
- * that of internal nodes, but we will wait until another day to
- * evaluate this.... It would be nice to someday measure and prove
- * these assumptions as to what is optimal....
- */
-
-static inline void do_balance_starts(struct tree_balance *tb)
-{
- /* use print_cur_tb() to see initial state of struct tree_balance */
-
- /* store_print_tb (tb); */
-
- /* do not delete, just comment it out */
- /*
- print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
- tb->tb_path->pos_in_item, tb, "check");
- */
- RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
-#ifdef CONFIG_REISERFS_CHECK
- REISERFS_SB(tb->tb_sb)->cur_tb = tb;
-#endif
-}
-
-static inline void do_balance_completed(struct tree_balance *tb)
-{
-#ifdef CONFIG_REISERFS_CHECK
- check_leaf_level(tb);
- check_internal_levels(tb);
- REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
-#endif
-
- /*
- * reiserfs_free_block is no longer schedule safe. So, we need to
- * put the buffers we want freed on the thrown list during do_balance,
- * and then free them now
- */
-
- REISERFS_SB(tb->tb_sb)->s_do_balance++;
-
- /* release all nodes held to perform the balancing */
- unfix_nodes(tb);
-
- free_thrown(tb);
-}
-
-/*
- * do_balance - balance the tree
- *
- * @tb: tree_balance structure
- * @ih: item header of inserted item
- * @body: body of inserted item or bytes to paste
- * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
- *
- * Cut means delete part of an item (includes removing an entry from a
- * directory).
- *
- * Delete means delete whole item.
- *
- * Insert means add a new item into the tree.
- *
- * Paste means to append to the end of an existing file or to
- * insert a directory entry.
- */
-void do_balance(struct tree_balance *tb, struct item_head *ih,
- const char *body, int flag)
-{
- int child_pos; /* position of a child node in its parent */
- int h; /* level of the tree being processed */
-
- /*
- * in our processing of one level we sometimes determine what
- * must be inserted into the next higher level. This insertion
- * consists of a key or two keys and their corresponding
- * pointers
- */
- struct item_head insert_key[2];
-
- /* inserted node-ptrs for the next level */
- struct buffer_head *insert_ptr[2];
-
- tb->tb_mode = flag;
- tb->need_balance_dirty = 0;
-
- if (FILESYSTEM_CHANGED_TB(tb)) {
- reiserfs_panic(tb->tb_sb, "clm-6000", "fs generation has "
- "changed");
- }
- /* if we have no real work to do */
- if (!tb->insert_size[0]) {
- reiserfs_warning(tb->tb_sb, "PAP-12350",
- "insert_size == 0, mode == %c", flag);
- unfix_nodes(tb);
- return;
- }
-
- atomic_inc(&fs_generation(tb->tb_sb));
- do_balance_starts(tb);
-
- /*
- * balance_leaf returns 0 except if combining L R and S into
- * one node. see balance_internal() for explanation of this
- * line of code.
- */
- child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
- balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
-
-#ifdef CONFIG_REISERFS_CHECK
- check_after_balance_leaf(tb);
-#endif
-
- /* Balance internal level of the tree. */
- for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
- child_pos = balance_internal(tb, h, child_pos, insert_key,
- insert_ptr);
-
- do_balance_completed(tb);
-}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
deleted file mode 100644
index 8eb3ad3e8ae9..000000000000
--- a/fs/reiserfs/file.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/*
- * We pack the tails of files on file close, not at the time they are written.
- * This implies an unnecessary copy of the tail and an unnecessary indirect item
- * insertion/balancing, for files that are written in one write.
- * It avoids unnecessary tail packings (balances) for files that are written in
- * multiple writes and are small enough to have tails.
- *
- * file_release is called by the VFS layer when the file is closed. If
- * this is the last open file descriptor, and the file is
- * small enough to have a tail, and the tail is currently in an
- * unformatted node, the tail is converted back into a direct item.
- *
- * We use reiserfs_truncate_file to pack the tail, since it already has
- * all the conditions coded.
- */
-static int reiserfs_file_release(struct inode *inode, struct file *filp)
-{
- struct reiserfs_transaction_handle th;
- int err;
- int jbegin_failure = 0;
-
- BUG_ON(!S_ISREG(inode->i_mode));
-
- if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
- &REISERFS_I(inode)->tailpack))
- return 0;
-
- /* fast out for when nothing needs to be done */
- if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
- !tail_has_to_be_packed(inode)) &&
- REISERFS_I(inode)->i_prealloc_count <= 0) {
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- return 0;
- }
-
- reiserfs_write_lock(inode->i_sb);
- /*
- * freeing preallocation only involves relogging blocks that
- * are already in the current transaction. preallocation gets
- * freed at the end of each transaction, so it is impossible for
- * us to log any additional blocks (including quota blocks)
- */
- err = journal_begin(&th, inode->i_sb, 1);
- if (err) {
- /*
- * uh oh, we can't allow the inode to go away while there
- * are still preallocated blocks pending. Try to join the
- * aborted transaction
- */
- jbegin_failure = err;
- err = journal_join_abort(&th, inode->i_sb);
-
- if (err) {
- /*
- * hmpf, our choices here aren't good. We can pin
- * the inode which will disallow unmount from ever
- * happening, we can do nothing, which will corrupt
- * random memory on unmount, or we can forcibly
- * remove the file from the preallocation list, which
- * will leak blocks on disk. Let's pin the inode
- * and let the admin know what is going on.
- */
- igrab(inode);
- reiserfs_warning(inode->i_sb, "clm-9001",
- "pinning inode %lu because the "
- "preallocation can't be freed",
- inode->i_ino);
- goto out;
- }
- }
- reiserfs_update_inode_transaction(inode);
-
-#ifdef REISERFS_PREALLOCATE
- reiserfs_discard_prealloc(&th, inode);
-#endif
- err = journal_end(&th);
-
- /* copy back the error code from journal_begin */
- if (!err)
- err = jbegin_failure;
-
- if (!err &&
- (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
- tail_has_to_be_packed(inode)) {
-
- /*
-		 * if a regular file is released by its last holder and it
-		 * has been appended (we append by unformatted node only) or
-		 * its direct item(s) had to be converted, then it may need
-		 * an indirect2direct conversion
- */
- err = reiserfs_truncate_file(inode, 0);
- }
-out:
- reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- return err;
-}
-
-static int reiserfs_file_open(struct inode *inode, struct file *file)
-{
- int err = dquot_file_open(inode, file);
-
- /* somebody might be tailpacking on final close; wait for it */
- if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
- mutex_lock(&REISERFS_I(inode)->tailpack);
- atomic_inc(&REISERFS_I(inode)->openers);
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- }
- return err;
-}
-
-void reiserfs_vfs_truncate_file(struct inode *inode)
-{
- mutex_lock(&REISERFS_I(inode)->tailpack);
- reiserfs_truncate_file(inode, 1);
- mutex_unlock(&REISERFS_I(inode)->tailpack);
-}
-
-/* Sync a reiserfs file. */
-
-/*
- * FIXME: sync_mapping_buffers() never has anything to sync. Can
- * be removed...
- */
-
-static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = filp->f_mapping->host;
- int err;
- int barrier_done;
-
- err = file_write_and_wait_range(filp, start, end);
- if (err)
- return err;
-
- inode_lock(inode);
- BUG_ON(!S_ISREG(inode->i_mode));
- err = sync_mapping_buffers(inode->i_mapping);
- reiserfs_write_lock(inode->i_sb);
- barrier_done = reiserfs_commit_for_inode(inode);
- reiserfs_write_unlock(inode->i_sb);
- if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
- blkdev_issue_flush(inode->i_sb->s_bdev);
- inode_unlock(inode);
- if (barrier_done < 0)
- return barrier_done;
- return (err < 0) ? -EIO : 0;
-}
-
-/* taken from fs/buffer.c:__block_commit_write */
-int reiserfs_commit_page(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- unsigned block_start, block_end;
- int partial = 0;
- unsigned blocksize;
- struct buffer_head *bh, *head;
- unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
- int new;
- int logit = reiserfs_file_data_log(inode);
- struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_SIZE / s->s_blocksize;
- struct reiserfs_transaction_handle th;
- int ret = 0;
-
- th.t_trans_id = 0;
- blocksize = i_blocksize(inode);
-
- if (logit) {
- reiserfs_write_lock(s);
- ret = journal_begin(&th, s, bh_per_page + 1);
- if (ret)
- goto drop_write_lock;
- reiserfs_update_inode_transaction(inode);
- }
- for (bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start = block_end, bh = bh->b_this_page) {
-
- new = buffer_new(bh);
- clear_buffer_new(bh);
- block_end = block_start + blocksize;
- if (block_end <= from || block_start >= to) {
- if (!buffer_uptodate(bh))
- partial = 1;
- } else {
- set_buffer_uptodate(bh);
- if (logit) {
- reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, bh);
- } else if (!buffer_dirty(bh)) {
- mark_buffer_dirty(bh);
- /*
- * do data=ordered on any page past the end
- * of file and any buffer marked BH_New.
- */
- if (reiserfs_data_ordered(inode->i_sb) &&
- (new || page->index >= i_size_index)) {
- reiserfs_add_ordered_list(inode, bh);
- }
- }
- }
- }
- if (logit) {
- ret = journal_end(&th);
-drop_write_lock:
- reiserfs_write_unlock(s);
- }
- /*
- * If this is a partial write which happened to make all buffers
- * uptodate then we can optimize away a bogus read_folio() for
- * the next read(). Here we 'discover' whether the page went
- * uptodate as a result of this (potentially partial) write.
- */
- if (!partial)
- SetPageUptodate(page);
- return ret;
-}
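
The buffer loop in reiserfs_commit_page() above uses the classic page-buffer ring walk: a page's buffer_heads form a circular singly linked list, so iteration starts at the head and stops once the cursor wraps back around to it (the kernel spells the wrap test as `bh != head || !block_start` in a for loop). A minimal, self-contained sketch of the same traversal over an invented toy_bh type (not the kernel's struct buffer_head) looks like this:

#include <stdio.h>

/* toy stand-in for struct buffer_head: page buffers form a circular ring */
struct toy_bh {
	struct toy_bh *b_this_page;	/* next buffer in the ring */
	unsigned int b_size;		/* bytes covered by this buffer */
};

/* visit each buffer once, reporting those that overlap [from, to) */
static void walk_ring(struct toy_bh *head, unsigned int from, unsigned int to)
{
	struct toy_bh *bh = head;
	unsigned int block_start = 0, block_end;

	do {
		block_end = block_start + bh->b_size;
		if (block_end > from && block_start < to)
			printf("buffer [%u,%u) overlaps the write\n",
			       block_start, block_end);
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}

int main(void)
{
	struct toy_bh b[4];
	int i;

	for (i = 0; i < 4; i++) {
		b[i].b_size = 1024;	/* 4 x 1KiB buffers in a 4KiB page */
		b[i].b_this_page = &b[(i + 1) % 4];
	}
	walk_ring(&b[0], 1000, 3000);	/* commit bytes 1000..2999 */
	return 0;
}
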
-
-const struct file_operations reiserfs_file_operations = {
- .unlocked_ioctl = reiserfs_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = reiserfs_compat_ioctl,
-#endif
- .mmap = generic_file_mmap,
- .open = reiserfs_file_open,
- .release = reiserfs_file_release,
- .fsync = reiserfs_sync_file,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .splice_read = filemap_splice_read,
- .splice_write = iter_file_splice_write,
- .llseek = generic_file_llseek,
-};
-
-const struct inode_operations reiserfs_file_inode_operations = {
- .setattr = reiserfs_setattr,
- .listxattr = reiserfs_listxattr,
- .permission = reiserfs_permission,
- .get_inode_acl = reiserfs_get_acl,
- .set_acl = reiserfs_set_acl,
- .fileattr_get = reiserfs_fileattr_get,
- .fileattr_set = reiserfs_fileattr_set,
-};
-
-const struct inode_operations reiserfs_priv_file_inode_operations = {
- .setattr = reiserfs_setattr,
- .permission = reiserfs_permission,
- .fileattr_get = reiserfs_fileattr_get,
- .fileattr_set = reiserfs_fileattr_set,
-};
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
deleted file mode 100644
index 6c13a8d9a73c..000000000000
--- a/fs/reiserfs/fix_node.c
+++ /dev/null
@@ -1,2822 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * To make any changes in the tree we find a node that contains the item
- * to be changed/deleted, or the position in a node at which to insert a
- * new item. We call this node S. To do balancing we need to decide what
- * we will shift to the left/right neighbor, or to a new node, where the
- * new item will go, etc. To make this analysis simpler we build a
- * virtual node. A virtual node is an array of items that will replace
- * the items of node S. (For instance, if we are going to delete an
- * item, the virtual node does not contain it.) The virtual node keeps
- * information about item sizes and types, mergeability of the first and
- * last items, and the sizes of all entries in directory items. We use
- * this array of items when calculating what we can shift to neighbors
- * and how many nodes we would need if we did no shifting, if we shifted
- * to the left/right neighbor, or to both.
- */
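
To make the picture concrete, a virtual node can be modelled as nothing more than the array of item lengths that S would hold after the operation. The sketch below (toy_vnode, toy_build and friends are invented for illustration; the real struct virtual_node in reiserfs.h additionally tracks item types, boundary mergeability and directory entry sizes) builds the virtual contents for an insert:

#include <stdio.h>

/* toy model of a virtual node: the would-be contents of S after the op */
struct toy_vitem { int len; };		/* item length, header included */
struct toy_vnode {
	int nr_item;			/* items S would hold afterwards */
	int size;			/* bytes S would occupy afterwards */
	struct toy_vitem vi[16];
};

/* build the virtual node for inserting a new item of ins_len bytes at pos */
static void toy_build(struct toy_vnode *vn, const int *cur, int n,
		      int pos, int ins_len)
{
	int i, j = 0;

	vn->nr_item = n + 1;
	vn->size = 0;
	for (i = 0; i < vn->nr_item; i++) {
		vn->vi[i].len = (i == pos) ? ins_len : cur[j++];
		vn->size += vn->vi[i].len;
	}
}

int main(void)
{
	int cur[3] = { 120, 300, 64 };		/* current item lengths */
	struct toy_vnode vn;

	toy_build(&vn, cur, 3, 1, 200);		/* insert 200 bytes at pos 1 */
	printf("virtual node: %d items, %d bytes\n", vn.nr_item, vn.size);
	return 0;
}

All of the shift analysis below (check_left(), check_right(), get_num_ver()) operates on exactly this kind of after-the-fact item array rather than on the real node.
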
-
-/*
- * Takes item number in virtual node, returns number of item
- * that it has in source buffer
- */
-static inline int old_item_num(int new_num, int affected_item_num, int mode)
-{
- if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
- return new_num;
-
- if (mode == M_INSERT) {
-
- RFALSE(new_num == 0,
- "vs-8005: for INSERT mode and item number of inserted item");
-
- return new_num - 1;
- }
-
- RFALSE(mode != M_DELETE,
- "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
- mode);
- /* delete mode */
- return new_num + 1;
-}
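
For intuition: after an insert, the items at and beyond the affected position sit one slot higher in the virtual node; after a delete, one slot lower; paste and cut change an item in place. So the mapping back to the source buffer is a plain off-by-one in the affected region, as this hedged stand-alone restatement shows (single-letter modes invented for the demo):

#include <stdio.h>

/* map a virtual-node item index back to its index in the source node */
static int toy_old_index(int new_num, int affected, char mode)
{
	if (new_num < affected)
		return new_num;		/* before the change: unchanged */
	if (mode == 'i')
		return new_num - 1;	/* insert shifted items up by one */
	if (mode == 'd')
		return new_num + 1;	/* delete shifted items down by one */
	return new_num;			/* paste/cut: index is unchanged */
}

int main(void)
{
	printf("%d\n", toy_old_index(3, 1, 'i'));	/* prints 2 */
	printf("%d\n", toy_old_index(3, 1, 'd'));	/* prints 4 */
	return 0;
}
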
-
-static void create_virtual_node(struct tree_balance *tb, int h)
-{
- struct item_head *ih;
- struct virtual_node *vn = tb->tb_vn;
- int new_num;
- struct buffer_head *Sh; /* this comes from tb->S[h] */
-
- Sh = PATH_H_PBUFFER(tb->tb_path, h);
-
- /* size of changed node */
- vn->vn_size =
- MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
-
-	/* for internal nodes the array of virtual items is not created */
- if (h) {
- vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
- return;
- }
-
- /* number of items in virtual node */
- vn->vn_nr_item =
- B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
- ((vn->vn_mode == M_DELETE) ? 1 : 0);
-
- /* first virtual item */
- vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
- memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
- vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
-
- /* first item in the node */
- ih = item_head(Sh, 0);
-
- /* define the mergeability for 0-th item (if it is not being deleted) */
- if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
- && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
- vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
-
- /*
- * go through all items that remain in the virtual
- * node (except for the new (inserted) one)
- */
- for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
- int j;
- struct virtual_item *vi = vn->vn_vi + new_num;
- int is_affected =
- ((new_num != vn->vn_affected_item_num) ? 0 : 1);
-
- if (is_affected && vn->vn_mode == M_INSERT)
- continue;
-
- /* get item number in source node */
- j = old_item_num(new_num, vn->vn_affected_item_num,
- vn->vn_mode);
-
- vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
- vi->vi_ih = ih + j;
- vi->vi_item = ih_item_body(Sh, ih + j);
- vi->vi_uarea = vn->vn_free_ptr;
-
- /*
- * FIXME: there is no check that item operation did not
- * consume too much memory
- */
- vn->vn_free_ptr +=
- op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
- if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
- reiserfs_panic(tb->tb_sb, "vs-8030",
- "virtual node space consumed");
-
- if (!is_affected)
- /* this is not being changed */
- continue;
-
- if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
- vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
- /* pointer to data which is going to be pasted */
- vi->vi_new_data = vn->vn_data;
- }
- }
-
- /* virtual inserted item is not defined yet */
- if (vn->vn_mode == M_INSERT) {
- struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
-
- RFALSE(vn->vn_ins_ih == NULL,
- "vs-8040: item header of inserted item is not specified");
- vi->vi_item_len = tb->insert_size[0];
- vi->vi_ih = vn->vn_ins_ih;
- vi->vi_item = vn->vn_data;
- vi->vi_uarea = vn->vn_free_ptr;
-
- op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
- tb->insert_size[0]);
- }
-
- /*
- * set right merge flag we take right delimiting key and
- * check whether it is a mergeable item
- */
- if (tb->CFR[0]) {
- struct reiserfs_key *key;
-
- key = internal_key(tb->CFR[0], tb->rkey[0]);
- if (op_is_left_mergeable(key, Sh->b_size)
- && (vn->vn_mode != M_DELETE
- || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
- vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
- VI_TYPE_RIGHT_MERGEABLE;
-
-#ifdef CONFIG_REISERFS_CHECK
- if (op_is_left_mergeable(key, Sh->b_size) &&
- !(vn->vn_mode != M_DELETE
- || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
- /*
- * we delete last item and it could be merged
- * with right neighbor's first item
- */
- if (!
- (B_NR_ITEMS(Sh) == 1
- && is_direntry_le_ih(item_head(Sh, 0))
- && ih_entry_count(item_head(Sh, 0)) == 1)) {
- /*
- * node contains more than 1 item, or item
- * is not directory item, or this item
- * contains more than 1 entry
- */
- print_block(Sh, 0, -1, -1);
- reiserfs_panic(tb->tb_sb, "vs-8045",
- "rdkey %k, affected item==%d "
- "(mode==%c) Must be %c",
- key, vn->vn_affected_item_num,
- vn->vn_mode, M_DELETE);
- }
- }
-#endif
-
- }
-}
-
-/*
- * Using virtual node check, how many items can be
- * shifted to left neighbor
- */
-static void check_left(struct tree_balance *tb, int h, int cur_free)
-{
- int i;
- struct virtual_node *vn = tb->tb_vn;
- struct virtual_item *vi;
- int d_size, ih_size;
-
- RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
-
- /* internal level */
- if (h > 0) {
- tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
- return;
- }
-
- /* leaf level */
-
- if (!cur_free || !vn->vn_nr_item) {
- /* no free space or nothing to move */
- tb->lnum[h] = 0;
- tb->lbytes = -1;
- return;
- }
-
- RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
- "vs-8055: parent does not exist or invalid");
-
- vi = vn->vn_vi;
- if ((unsigned int)cur_free >=
- (vn->vn_size -
- ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
- /* all contents of S[0] fits into L[0] */
-
- RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
- "vs-8055: invalid mode or balance condition failed");
-
- tb->lnum[0] = vn->vn_nr_item;
- tb->lbytes = -1;
- return;
- }
-
- d_size = 0, ih_size = IH_SIZE;
-
-	/* the first item may be merged with the last item of the left neighbor */
- if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
- d_size = -((int)IH_SIZE), ih_size = 0;
-
- tb->lnum[0] = 0;
- for (i = 0; i < vn->vn_nr_item;
- i++, ih_size = IH_SIZE, d_size = 0, vi++) {
- d_size += vi->vi_item_len;
- if (cur_free >= d_size) {
- /* the item can be shifted entirely */
- cur_free -= d_size;
- tb->lnum[0]++;
- continue;
- }
-
- /* the item cannot be shifted entirely, try to split it */
- /*
- * check whether L[0] can hold ih and at least one byte
- * of the item body
- */
-
- /* cannot shift even a part of the current item */
- if (cur_free <= ih_size) {
- tb->lbytes = -1;
- return;
- }
- cur_free -= ih_size;
-
- tb->lbytes = op_check_left(vi, cur_free, 0, 0);
- if (tb->lbytes != -1)
- /* count partially shifted item */
- tb->lnum[0]++;
-
- break;
- }
-
- return;
-}
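
check_left() is a greedy first-fit: whole items move while they fit, then at most one boundary item is split, and only if its header plus at least one body byte fit. A compact stand-alone model of that computation (byte-granular items only, so none of the directory-entry handling; the 24-byte header size is invented for the demo):

#include <stdio.h>

#define TOY_IH_SIZE 24			/* illustrative item-header size */

/* how many whole items, plus bytes of one split item, fit into cur_free */
static int toy_check_shift(const int *len, int n, int cur_free, int *bytes)
{
	int i, moved = 0;

	*bytes = -1;			/* -1 means no partially moved item */
	for (i = 0; i < n; i++) {
		if (cur_free >= len[i]) {	/* whole item fits */
			cur_free -= len[i];
			moved++;
			continue;
		}
		if (cur_free > TOY_IH_SIZE) {	/* header + >= 1 body byte */
			*bytes = cur_free - TOY_IH_SIZE;
			moved++;	/* the split item counts as moved */
		}
		break;
	}
	return moved;
}

int main(void)
{
	int len[3] = { 100, 200, 400 }, bytes, moved;

	moved = toy_check_shift(len, 3, 350, &bytes);
	/* prints: shift 3 items, 26 bytes of the split one */
	printf("shift %d items, %d bytes of the split one\n", moved, bytes);
	return 0;
}

check_right() below is the mirror image, walking the virtual items from the last one backwards.
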
-
-/*
- * Using virtual node check, how many items can be
- * shifted to right neighbor
- */
-static void check_right(struct tree_balance *tb, int h, int cur_free)
-{
- int i;
- struct virtual_node *vn = tb->tb_vn;
- struct virtual_item *vi;
- int d_size, ih_size;
-
- RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
-
- /* internal level */
- if (h > 0) {
- tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
- return;
- }
-
- /* leaf level */
-
- if (!cur_free || !vn->vn_nr_item) {
-		/* no free space or nothing to move */
- tb->rnum[h] = 0;
- tb->rbytes = -1;
- return;
- }
-
- RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
- "vs-8075: parent does not exist or invalid");
-
- vi = vn->vn_vi + vn->vn_nr_item - 1;
- if ((unsigned int)cur_free >=
- (vn->vn_size -
- ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
- /* all contents of S[0] fits into R[0] */
-
- RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
- "vs-8080: invalid mode or balance condition failed");
-
- tb->rnum[h] = vn->vn_nr_item;
- tb->rbytes = -1;
- return;
- }
-
- d_size = 0, ih_size = IH_SIZE;
-
-	/* the last item may be merged with the first item of the right neighbor */
- if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
- d_size = -(int)IH_SIZE, ih_size = 0;
-
- tb->rnum[0] = 0;
- for (i = vn->vn_nr_item - 1; i >= 0;
- i--, d_size = 0, ih_size = IH_SIZE, vi--) {
- d_size += vi->vi_item_len;
- if (cur_free >= d_size) {
- /* the item can be shifted entirely */
- cur_free -= d_size;
- tb->rnum[0]++;
- continue;
- }
-
- /*
- * check whether R[0] can hold ih and at least one
- * byte of the item body
- */
-
- /* cannot shift even a part of the current item */
- if (cur_free <= ih_size) {
- tb->rbytes = -1;
- return;
- }
-
- /*
- * R[0] can hold the header of the item and at least
- * one byte of its body
- */
- cur_free -= ih_size; /* cur_free is still > 0 */
-
- tb->rbytes = op_check_right(vi, cur_free);
- if (tb->rbytes != -1)
- /* count partially shifted item */
- tb->rnum[0]++;
-
- break;
- }
-
- return;
-}
-
-/*
- * from - number of items which are shifted to the left neighbor entirely
- * to - number of items which are shifted to the right neighbor entirely
- * from_bytes - number of bytes of the boundary item (or directory entries)
- * which are shifted to the left neighbor
- * to_bytes - number of bytes of the boundary item (or directory entries)
- * which are shifted to the right neighbor
- */
-static int get_num_ver(int mode, struct tree_balance *tb, int h,
- int from, int from_bytes,
- int to, int to_bytes, short *snum012, int flow)
-{
- int i;
- int units;
- struct virtual_node *vn = tb->tb_vn;
- int total_node_size, max_node_size, current_item_size;
- int needed_nodes;
-
- /* position of item we start filling node from */
- int start_item;
-
- /* position of item we finish filling node by */
- int end_item;
-
- /*
- * number of first bytes (entries for directory) of start_item-th item
- * we do not include into node that is being filled
- */
- int start_bytes;
-
- /*
- * number of last bytes (entries for directory) of end_item-th item
-	 * we do not include into the node that is being filled
- */
- int end_bytes;
-
- /*
- * these are positions in virtual item of items, that are split
- * between S[0] and S1new and S1new and S2new
- */
- int split_item_positions[2];
-
- split_item_positions[0] = -1;
- split_item_positions[1] = -1;
-
- /*
- * We only create additional nodes if we are in insert or paste mode
- * or we are in replace mode at the internal level. If h is 0 and
- * the mode is M_REPLACE then in fix_nodes we change the mode to
- * paste or insert before we get here in the code.
- */
- RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
- "vs-8100: insert_size < 0 in overflow");
-
- max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
-
- /*
-	 * snum012 [0-2] - number of items that go
-	 * to S[0], the first new node and the second new node
- */
- snum012[3] = -1; /* s1bytes */
- snum012[4] = -1; /* s2bytes */
-
- /* internal level */
- if (h > 0) {
- i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
- if (i == max_node_size)
- return 1;
- return (i / max_node_size + 1);
- }
-
- /* leaf level */
- needed_nodes = 1;
- total_node_size = 0;
-
- /* start from 'from'-th item */
- start_item = from;
- /* skip its first 'start_bytes' units */
- start_bytes = ((from_bytes != -1) ? from_bytes : 0);
-
- /* last included item is the 'end_item'-th one */
- end_item = vn->vn_nr_item - to - 1;
- /* do not count last 'end_bytes' units of 'end_item'-th item */
- end_bytes = (to_bytes != -1) ? to_bytes : 0;
-
- /*
-	 * go through all items beginning from the start_item-th item
-	 * and ending with the end_item-th item. Do not count the first
-	 * 'start_bytes' units of the 'start_item'-th item and the last
-	 * 'end_bytes' of the 'end_item'-th item
- */
- for (i = start_item; i <= end_item; i++) {
- struct virtual_item *vi = vn->vn_vi + i;
- int skip_from_end = ((i == end_item) ? end_bytes : 0);
-
- RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
-
- /* get size of current item */
- current_item_size = vi->vi_item_len;
-
- /*
- * do not take in calculation head part (from_bytes)
- * of from-th item
- */
- current_item_size -=
- op_part_size(vi, 0 /*from start */ , start_bytes);
-
- /* do not take in calculation tail part of last item */
- current_item_size -=
- op_part_size(vi, 1 /*from end */ , skip_from_end);
-
-		/* if the item fits into the current node entirely */
- if (total_node_size + current_item_size <= max_node_size) {
- snum012[needed_nodes - 1]++;
- total_node_size += current_item_size;
- start_bytes = 0;
- continue;
- }
-
- /*
-		 * virtual item length is longer than the max item size for
-		 * a node. That is impossible for a direct item
- */
- if (current_item_size > max_node_size) {
- RFALSE(is_direct_le_ih(vi->vi_ih),
- "vs-8110: "
- "direct item length is %d. It can not be longer than %d",
- current_item_size, max_node_size);
- /* we will try to split it */
- flow = 1;
- }
-
- /* as we do not split items, take new node and continue */
- if (!flow) {
- needed_nodes++;
- i--;
- total_node_size = 0;
- continue;
- }
-
- /*
- * calculate number of item units which fit into node being
- * filled
- */
- {
- int free_space;
-
- free_space = max_node_size - total_node_size - IH_SIZE;
- units =
- op_check_left(vi, free_space, start_bytes,
- skip_from_end);
- /*
- * nothing fits into current node, take new
- * node and continue
- */
- if (units == -1) {
- needed_nodes++, i--, total_node_size = 0;
- continue;
- }
- }
-
- /* something fits into the current node */
- start_bytes += units;
- snum012[needed_nodes - 1 + 3] = units;
-
- if (needed_nodes > 2)
- reiserfs_warning(tb->tb_sb, "vs-8111",
- "split_item_position is out of range");
- snum012[needed_nodes - 1]++;
- split_item_positions[needed_nodes - 1] = i;
- needed_nodes++;
- /* continue from the same item with start_bytes != -1 */
- start_item = i;
- i--;
- total_node_size = 0;
- }
-
- /*
-	 * snum012[4] (if it is not -1) contains the number of units which
-	 * are to be in S1new, snum012[3] - the number to be in S0. They are
-	 * supposed to be S1bytes and S2bytes correspondingly, so recalculate
- */
- if (snum012[4] > 0) {
- int split_item_num;
- int bytes_to_r, bytes_to_l;
- int bytes_to_S1new;
-
- split_item_num = split_item_positions[1];
- bytes_to_l =
- ((from == split_item_num
- && from_bytes != -1) ? from_bytes : 0);
- bytes_to_r =
- ((end_item == split_item_num
- && end_bytes != -1) ? end_bytes : 0);
- bytes_to_S1new =
- ((split_item_positions[0] ==
- split_item_positions[1]) ? snum012[3] : 0);
-
- /* s2bytes */
- snum012[4] =
- op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
- bytes_to_r - bytes_to_l - bytes_to_S1new;
-
- if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
- vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
- reiserfs_warning(tb->tb_sb, "vs-8115",
- "not directory or indirect item");
- }
-
- /* now we know S2bytes, calculate S1bytes */
- if (snum012[3] > 0) {
- int split_item_num;
- int bytes_to_r, bytes_to_l;
- int bytes_to_S2new;
-
- split_item_num = split_item_positions[0];
- bytes_to_l =
- ((from == split_item_num
- && from_bytes != -1) ? from_bytes : 0);
- bytes_to_r =
- ((end_item == split_item_num
- && end_bytes != -1) ? end_bytes : 0);
- bytes_to_S2new =
- ((split_item_positions[0] == split_item_positions[1]
- && snum012[4] != -1) ? snum012[4] : 0);
-
- /* s1bytes */
- snum012[3] =
- op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
- bytes_to_r - bytes_to_l - bytes_to_S2new;
- }
-
- return needed_nodes;
-}
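
The no-flow core of get_num_ver() is plain bin-filling: open a new node whenever the next whole item would overflow the node being filled. A worked stand-alone sketch (names invented; 4072 stands in for blocksize minus block head, purely for illustration):

#include <stdio.h>

/* no-flow packing: count nodes needed without splitting any item */
static int toy_num_nodes(const int *len, int n, int max_node)
{
	int i, nodes = 1, used = 0;

	for (i = 0; i < n; i++) {
		if (used + len[i] > max_node) {	/* would spill: new node */
			nodes++;
			used = 0;
		}
		used += len[i];
	}
	return nodes;
}

int main(void)
{
	int len[3] = { 1500, 1500, 1500 };

	/* three 1500-byte items into 4072-byte nodes: prints 2 */
	printf("%d\n", toy_num_nodes(len, 3, 4072));
	return 0;
}

The flow variants do the same walk but let op_check_left() move part of the boundary item into the node being closed, which is what fills snum012[3] and snum012[4].
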
-
-
-/*
- * Set parameters for balancing.
- * Performs write of results of analysis of balancing into structure tb,
- * where it will later be used by the functions that actually do the balancing.
- * Parameters:
- * tb tree_balance structure;
- * h current level of the node;
- * lnum number of items from S[h] that must be shifted to L[h];
- * rnum number of items from S[h] that must be shifted to R[h];
- *	blk_num	number of blocks that S[h] will be split into;
- *	s012	number of items that fall into the split nodes.
- * lbytes number of bytes which flow to the left neighbor from the
- * item that is not shifted entirely
- * rbytes number of bytes which flow to the right neighbor from the
- * item that is not shifted entirely
- * s1bytes number of bytes which flow to the first new node when
- * S[0] splits (this number is contained in s012 array)
- */
-
-static void set_parameters(struct tree_balance *tb, int h, int lnum,
- int rnum, int blk_num, short *s012, int lb, int rb)
-{
-
- tb->lnum[h] = lnum;
- tb->rnum[h] = rnum;
- tb->blknum[h] = blk_num;
-
- /* only for leaf level */
- if (h == 0) {
- if (s012 != NULL) {
- tb->s0num = *s012++;
- tb->snum[0] = *s012++;
- tb->snum[1] = *s012++;
- tb->sbytes[0] = *s012++;
- tb->sbytes[1] = *s012;
- }
- tb->lbytes = lb;
- tb->rbytes = rb;
- }
- PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
- PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
-
- PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
- PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
-}
-
-/*
- * check if node disappears if we shift tb->lnum[0] items to left
- * neighbor and tb->rnum[0] to the right one.
- */
-static int is_leaf_removable(struct tree_balance *tb)
-{
- struct virtual_node *vn = tb->tb_vn;
- int to_left, to_right;
- int size;
- int remain_items;
-
- /*
- * number of items that will be shifted to left (right) neighbor
- * entirely
- */
- to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
- to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
- remain_items = vn->vn_nr_item;
-
- /* how many items remain in S[0] after shiftings to neighbors */
- remain_items -= (to_left + to_right);
-
- /* all content of node can be shifted to neighbors */
- if (remain_items < 1) {
- set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
- NULL, -1, -1);
- return 1;
- }
-
- /* S[0] is not removable */
- if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
- return 0;
-
- /* check whether we can divide 1 remaining item between neighbors */
-
- /* get size of remaining item (in item units) */
- size = op_unit_num(&vn->vn_vi[to_left]);
-
- if (tb->lbytes + tb->rbytes >= size) {
- set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
- tb->lbytes, -1);
- return 1;
- }
-
- return 0;
-}
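
Numerically, the final test says: the bytes that can flow left plus the bytes that can flow right must cover every unit of the single item left behind. For example, in a 4-item leaf with tb->lnum[0] = 3, tb->lbytes = 40, tb->rnum[0] = 2, tb->rbytes = 70 and a 100-unit boundary item, to_left = 2 and to_right = 1 whole items move, exactly one item remains, and 40 + 70 >= 100, so S[0] can be emptied by splitting that item across both neighbors.
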
-
-/* check whether L, S, R can be joined in one node */
-static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
-{
- struct virtual_node *vn = tb->tb_vn;
- int ih_size;
- struct buffer_head *S0;
-
- S0 = PATH_H_PBUFFER(tb->tb_path, 0);
-
- ih_size = 0;
- if (vn->vn_nr_item) {
- if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
- ih_size += IH_SIZE;
-
- if (vn->vn_vi[vn->vn_nr_item - 1].
- vi_type & VI_TYPE_RIGHT_MERGEABLE)
- ih_size += IH_SIZE;
- } else {
- /* there was only one item and it will be deleted */
- struct item_head *ih;
-
- RFALSE(B_NR_ITEMS(S0) != 1,
- "vs-8125: item number must be 1: it is %d",
- B_NR_ITEMS(S0));
-
- ih = item_head(S0, 0);
- if (tb->CFR[0]
- && !comp_short_le_keys(&ih->ih_key,
- internal_key(tb->CFR[0],
- tb->rkey[0])))
- /*
-			 * The directory must be in a correct state here:
-			 * somewhere to its left the first directory item
-			 * must exist. The item being deleted cannot be
-			 * that first one, because its right neighbor is
-			 * an item of the same directory (and the first
-			 * item is always deleted last). So neighbors of
-			 * the deleted item can be merged, and we can
-			 * save ih_size
- */
- if (is_direntry_le_ih(ih)) {
- ih_size = IH_SIZE;
-
- /*
- * we might check that left neighbor exists
- * and is of the same directory
- */
- RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
- "vs-8130: first directory item can not be removed until directory is not empty");
- }
-
- }
-
- if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
- set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
- PROC_INFO_INC(tb->tb_sb, leaves_removable);
- return 1;
- }
- return 0;
-
-}
-
-/* when we do not split item, lnum and rnum are numbers of entire items */
-#define SET_PAR_SHIFT_LEFT \
-if (h)\
-{\
- int to_l;\
- \
- to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
- (MAX_NR_KEY(Sh) + 1 - lpar);\
- \
- set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
-}\
-else \
-{\
- if (lset==LEFT_SHIFT_FLOW)\
- set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
- tb->lbytes, -1);\
- else\
- set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
- -1, -1);\
-}
-
-#define SET_PAR_SHIFT_RIGHT \
-if (h)\
-{\
- int to_r;\
- \
- to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
- \
- set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
-}\
-else \
-{\
- if (rset==RIGHT_SHIFT_FLOW)\
- set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
- -1, tb->rbytes);\
- else\
- set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
- -1, -1);\
-}
-
-static void free_buffers_in_tb(struct tree_balance *tb)
-{
- int i;
-
- pathrelse(tb->tb_path);
-
- for (i = 0; i < MAX_HEIGHT; i++) {
- brelse(tb->L[i]);
- brelse(tb->R[i]);
- brelse(tb->FL[i]);
- brelse(tb->FR[i]);
- brelse(tb->CFL[i]);
- brelse(tb->CFR[i]);
-
- tb->L[i] = NULL;
- tb->R[i] = NULL;
- tb->FL[i] = NULL;
- tb->FR[i] = NULL;
- tb->CFL[i] = NULL;
- tb->CFR[i] = NULL;
- }
-}
-
-/*
- * Get new buffers for storing new nodes that are created while balancing.
- * Returns:	REPEAT_SEARCH - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
- * NO_DISK_SPACE - no disk space.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-static int get_empty_nodes(struct tree_balance *tb, int h)
-{
- struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
- b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
- int counter, number_of_freeblk;
- int amount_needed; /* number of needed empty blocks */
- int retval = CARRY_ON;
- struct super_block *sb = tb->tb_sb;
-
- /*
-	 * number_of_freeblk is the number of empty blocks which have been
-	 * acquired for use by the balancing algorithm minus the number of
-	 * empty blocks used in the previous levels of the analysis.
-	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
-	 * occurs after empty blocks are acquired, and the balancing analysis
-	 * is then restarted. amount_needed is the number needed by this
-	 * level (h) of the balancing analysis.
- *
- * Note that for systems with many processes writing, it would be
- * more layout optimal to calculate the total number needed by all
- * levels and then to run reiserfs_new_blocks to get all of them at
- * once.
- */
-
- /*
-	 * Initialize number_of_freeblk to the amount acquired prior to the
- * restart of the analysis or 0 if not restarted, then subtract the
- * amount needed by all of the levels of the tree below h.
- */
- /* blknum includes S[h], so we subtract 1 in this calculation */
- for (counter = 0, number_of_freeblk = tb->cur_blknum;
- counter < h; counter++)
- number_of_freeblk -=
- (tb->blknum[counter]) ? (tb->blknum[counter] -
- 1) : 0;
-
- /* Allocate missing empty blocks. */
- /* if Sh == 0 then we are getting a new root */
- amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
- /*
- * Amount_needed = the amount that we need more than the
- * amount that we have.
- */
- if (amount_needed > number_of_freeblk)
- amount_needed -= number_of_freeblk;
- else /* If we have enough already then there is nothing to do. */
- return CARRY_ON;
-
- /*
-	 * No need to check quota - quota is not allocated for blocks
-	 * used for formatted nodes
- */
- if (reiserfs_new_form_blocknrs(tb, blocknrs,
- amount_needed) == NO_DISK_SPACE)
- return NO_DISK_SPACE;
-
- /* for each blocknumber we just got, get a buffer and stick it on FEB */
- for (blocknr = blocknrs, counter = 0;
- counter < amount_needed; blocknr++, counter++) {
-
- RFALSE(!*blocknr,
- "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
-
- new_bh = sb_getblk(sb, *blocknr);
- RFALSE(buffer_dirty(new_bh) ||
- buffer_journaled(new_bh) ||
- buffer_journal_dirty(new_bh),
- "PAP-8140: journaled or dirty buffer %b for the new block",
- new_bh);
-
- /* Put empty buffers into the array. */
- RFALSE(tb->FEB[tb->cur_blknum],
- "PAP-8141: busy slot for new buffer");
-
- set_buffer_journal_new(new_bh);
- tb->FEB[tb->cur_blknum++] = new_bh;
- }
-
- if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
- retval = REPEAT_SEARCH;
-
- return retval;
-}
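
The accounting at the top of this function is easiest to see with numbers: tb->blknum[lev] counts S[lev] itself plus any new nodes at that level, so each lower level consumes blknum[lev] - 1 of the pre-acquired blocks, and level h then needs blknum[h] - 1 more (or exactly one block when growing a new root). A hedged stand-alone restatement of that arithmetic, with invented names:

#include <stdio.h>

/* blocks still to allocate at level h, given cur_blknum grabbed so far */
static int toy_amount_needed(const int *blknum, int h, int cur_blknum)
{
	int lev, freeblk = cur_blknum, need;

	for (lev = 0; lev < h; lev++)	/* lower levels keep blknum - 1 each */
		freeblk -= blknum[lev] ? blknum[lev] - 1 : 0;

	need = blknum[h] - 1;		/* S[h] itself is not a new block */
	return need > freeblk ? need - freeblk : 0;
}

int main(void)
{
	int blknum[2] = { 3, 2 };	/* leaf splits in 3, parent in 2 */

	/* 3 blocks pre-acquired, the leaf level used 2: prints 0 */
	printf("%d\n", toy_amount_needed(blknum, 1, 3));
	return 0;
}
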
-
-/*
- * Get free space of the left neighbor, which is stored in the parent
- * node of the left neighbor.
- */
-static int get_lfree(struct tree_balance *tb, int h)
-{
- struct buffer_head *l, *f;
- int order;
-
- if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
- (l = tb->FL[h]) == NULL)
- return 0;
-
- if (f == l)
- order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
- else {
- order = B_NR_ITEMS(l);
- f = l;
- }
-
- return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-}
-
-/*
- * Get free space of the right neighbor,
- * which is stored in the parent node of the right neighbor.
- */
-static int get_rfree(struct tree_balance *tb, int h)
-{
- struct buffer_head *r, *f;
- int order;
-
- if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
- (r = tb->FR[h]) == NULL)
- return 0;
-
- if (f == r)
- order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
- else {
- order = 0;
- f = r;
- }
-
- return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
-
-}
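
Both helpers rely on the same invariant: the parent's disk_child entry (dc_size) records how many bytes the child already uses, so a neighbor's free space can be computed without ever reading the neighbor itself. As a hedged example, with a 4096-byte block and an illustrative 24-byte block head, MAX_CHILD_SIZE would be 4072; a recorded dc_size of 3000 then means the neighbor has 4072 - 3000 = 1072 bytes free.
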
-
-/* Check whether left neighbor is in memory. */
-static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
-{
- struct buffer_head *father, *left;
- struct super_block *sb = tb->tb_sb;
- b_blocknr_t left_neighbor_blocknr;
- int left_neighbor_position;
-
- /* Father of the left neighbor does not exist. */
- if (!tb->FL[h])
- return 0;
-
- /* Calculate father of the node to be balanced. */
- father = PATH_H_PBUFFER(tb->tb_path, h + 1);
-
- RFALSE(!father ||
- !B_IS_IN_TREE(father) ||
- !B_IS_IN_TREE(tb->FL[h]) ||
- !buffer_uptodate(father) ||
- !buffer_uptodate(tb->FL[h]),
- "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
- father, tb->FL[h]);
-
- /*
- * Get position of the pointer to the left neighbor
- * into the left father.
- */
- left_neighbor_position = (father == tb->FL[h]) ?
- tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
- /* Get left neighbor block number. */
- left_neighbor_blocknr =
- B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
- /* Look for the left neighbor in the cache. */
- if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
-
- RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
- "vs-8170: left neighbor (%b %z) is not in the tree",
- left, left);
- put_bh(left);
- return 1;
- }
-
- return 0;
-}
-
-#define LEFT_PARENTS 'l'
-#define RIGHT_PARENTS 'r'
-
-static void decrement_key(struct cpu_key *key)
-{
- /* call item specific function for this key */
- item_ops[cpu_key_k_type(key)]->decrement_key(key);
-}
-
-/*
- * Calculate far left/right parent of the left/right neighbor of the
- * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
- * of the parent F[h].
- * Calculate left/right common parent of the current node and L[h]/R[h].
- * Calculate left/right delimiting key position.
- * Returns:	REPEAT_SEARCH - path in the tree is not correct
- * SCHEDULE_OCCURRED - schedule occurred while the function worked
- * CARRY_ON - schedule didn't occur while the function
- * worked
- */
-static int get_far_parent(struct tree_balance *tb,
- int h,
- struct buffer_head **pfather,
- struct buffer_head **pcom_father, char c_lr_par)
-{
- struct buffer_head *parent;
- INITIALIZE_PATH(s_path_to_neighbor_father);
- struct treepath *path = tb->tb_path;
- struct cpu_key s_lr_father_key;
- int counter,
- position = INT_MAX,
- first_last_position = 0,
- path_offset = PATH_H_PATH_OFFSET(path, h);
-
- /*
- * Starting from F[h] go upwards in the tree, and look for the common
- * ancestor of F[h], and its neighbor l/r, that should be obtained.
- */
-
- counter = path_offset;
-
- RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
- "PAP-8180: invalid path length");
-
- for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
- /*
- * Check whether parent of the current buffer in the path
- * is really parent in the tree.
- */
- if (!B_IS_IN_TREE
- (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
- return REPEAT_SEARCH;
-
- /* Check whether position in the parent is correct. */
- if ((position =
- PATH_OFFSET_POSITION(path,
- counter - 1)) >
- B_NR_ITEMS(parent))
- return REPEAT_SEARCH;
-
- /*
- * Check whether parent at the path really points
- * to the child.
- */
- if (B_N_CHILD_NUM(parent, position) !=
- PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
- return REPEAT_SEARCH;
-
- /*
- * Return delimiting key if position in the parent is not
- * equal to first/last one.
- */
- if (c_lr_par == RIGHT_PARENTS)
- first_last_position = B_NR_ITEMS(parent);
- if (position != first_last_position) {
- *pcom_father = parent;
- get_bh(*pcom_father);
- /*(*pcom_father = parent)->b_count++; */
- break;
- }
- }
-
- /* if we are in the root of the tree, then there is no common father */
- if (counter == FIRST_PATH_ELEMENT_OFFSET) {
- /*
- * Check whether first buffer in the path is the
- * root of the tree.
- */
- if (PATH_OFFSET_PBUFFER
- (tb->tb_path,
- FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
- SB_ROOT_BLOCK(tb->tb_sb)) {
- *pfather = *pcom_father = NULL;
- return CARRY_ON;
- }
- return REPEAT_SEARCH;
- }
-
- RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
- "PAP-8185: (%b %z) level too small",
- *pcom_father, *pcom_father);
-
- /* Check whether the common parent is locked. */
-
- if (buffer_locked(*pcom_father)) {
-
- /* Release the write lock while the buffer is busy */
- int depth = reiserfs_write_unlock_nested(tb->tb_sb);
- __wait_on_buffer(*pcom_father);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (FILESYSTEM_CHANGED_TB(tb)) {
- brelse(*pcom_father);
- return REPEAT_SEARCH;
- }
- }
-
- /*
- * So, we got common parent of the current node and its
- * left/right neighbor. Now we are getting the parent of the
- * left/right neighbor.
- */
-
- /* Form key to get parent of the left/right neighbor. */
- le_key2cpu_key(&s_lr_father_key,
- internal_key(*pcom_father,
- (c_lr_par ==
- LEFT_PARENTS) ? (tb->lkey[h - 1] =
- position -
- 1) : (tb->rkey[h -
- 1] =
- position)));
-
- if (c_lr_par == LEFT_PARENTS)
- decrement_key(&s_lr_father_key);
-
- if (search_by_key
- (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
- h + 1) == IO_ERROR)
- /* path is released */
- return IO_ERROR;
-
- if (FILESYSTEM_CHANGED_TB(tb)) {
- pathrelse(&s_path_to_neighbor_father);
- brelse(*pcom_father);
- return REPEAT_SEARCH;
- }
-
- *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
-
- RFALSE(B_LEVEL(*pfather) != h + 1,
- "PAP-8190: (%b %z) level too small", *pfather, *pfather);
- RFALSE(s_path_to_neighbor_father.path_length <
- FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
-
- s_path_to_neighbor_father.path_length--;
- pathrelse(&s_path_to_neighbor_father);
- return CARRY_ON;
-}
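
Stripped of buffer references and locking, the ascent in the first half of get_far_parent() reads: climb the path until the current node is not the first (when looking for a left neighbor) or last (for a right neighbor) child of its parent; that parent is the common ancestor, and the descent to the neighbor's parent then goes through the adjacent key. A self-contained sketch of just the ascent, over an invented array of (item count, position) path elements:

#include <stdio.h>

struct toy_pathelem {
	int nr_items;	/* items in the node at this path element */
	int pos;	/* which child the path descends into */
};

/* climb until we can step left (want_left) or right; -1 means no neighbor */
static int toy_far_parent(const struct toy_pathelem *p, int depth,
			  int want_left)
{
	int i, edge;

	for (i = depth - 1; i >= 0; i--) {
		edge = want_left ? 0 : p[i].nr_items;
		if (p[i].pos != edge)
			return i;	/* common ancestor is at this level */
	}
	return -1;			/* ran off the root: no neighbor */
}

int main(void)
{
	/* leaf is the 0th child of its parent, parent is the root's 2nd */
	struct toy_pathelem path[2] = { { 5, 2 }, { 4, 0 } };

	printf("%d\n", toy_far_parent(path, 2, 1));	/* prints 0: the root */
	return 0;
}
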
-
-/*
- * Get parents of neighbors of node in the path(S[path_offset]) and
- * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
- * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
- * CFR[path_offset].
- * Calculate numbers of left and right delimiting keys position:
- * lkey[path_offset], rkey[path_offset].
- * Returns:	REPEAT_SEARCH - schedule occurred while the function worked
- * CARRY_ON - schedule didn't occur while the function worked
- */
-static int get_parents(struct tree_balance *tb, int h)
-{
- struct treepath *path = tb->tb_path;
- int position,
- ret,
- path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
- struct buffer_head *curf, *curcf;
-
-	/* Current node is the root of the tree or will be the root of the tree */
- if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
- /*
- * The root can not have parents.
- * Release nodes which previously were obtained as
- * parents of the current node neighbors.
- */
- brelse(tb->FL[h]);
- brelse(tb->CFL[h]);
- brelse(tb->FR[h]);
- brelse(tb->CFR[h]);
- tb->FL[h] = NULL;
- tb->CFL[h] = NULL;
- tb->FR[h] = NULL;
- tb->CFR[h] = NULL;
- return CARRY_ON;
- }
-
- /* Get parent FL[path_offset] of L[path_offset]. */
- position = PATH_OFFSET_POSITION(path, path_offset - 1);
- if (position) {
- /* Current node is not the first child of its parent. */
- curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
- curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
- get_bh(curf);
- get_bh(curf);
- tb->lkey[h] = position - 1;
- } else {
- /*
- * Calculate current parent of L[path_offset], which is the
- * left neighbor of the current node. Calculate current
- * common parent of L[path_offset] and the current node.
- * Note that CFL[path_offset] not equal FL[path_offset] and
- * CFL[path_offset] not equal F[path_offset].
- * Calculate lkey[path_offset].
- */
- if ((ret = get_far_parent(tb, h + 1, &curf,
- &curcf,
- LEFT_PARENTS)) != CARRY_ON)
- return ret;
- }
-
- brelse(tb->FL[h]);
- tb->FL[h] = curf; /* New initialization of FL[h]. */
- brelse(tb->CFL[h]);
- tb->CFL[h] = curcf; /* New initialization of CFL[h]. */
-
- RFALSE((curf && !B_IS_IN_TREE(curf)) ||
- (curcf && !B_IS_IN_TREE(curcf)),
- "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
-
- /* Get parent FR[h] of R[h]. */
-
- /* Current node is the last child of F[h]. FR[h] != F[h]. */
- if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
- /*
- * Calculate current parent of R[h], which is the right
- * neighbor of F[h]. Calculate current common parent of
- * R[h] and current node. Note that CFR[h] not equal
- * FR[path_offset] and CFR[h] not equal F[h].
- */
- if ((ret =
- get_far_parent(tb, h + 1, &curf, &curcf,
- RIGHT_PARENTS)) != CARRY_ON)
- return ret;
- } else {
- /* Current node is not the last child of its parent F[h]. */
- curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
- curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
- get_bh(curf);
- get_bh(curf);
- tb->rkey[h] = position;
- }
-
- brelse(tb->FR[h]);
- /* New initialization of FR[path_offset]. */
- tb->FR[h] = curf;
-
- brelse(tb->CFR[h]);
- /* New initialization of CFR[path_offset]. */
- tb->CFR[h] = curcf;
-
- RFALSE((curf && !B_IS_IN_TREE(curf)) ||
- (curcf && !B_IS_IN_TREE(curcf)),
- "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
-
- return CARRY_ON;
-}
-
-/*
- * it is possible to remove a node as a result of shifting to
- * neighbors even when we insert or paste an item.
- */
-static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
- struct tree_balance *tb, int h)
-{
- struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
- int levbytes = tb->insert_size[h];
- struct item_head *ih;
- struct reiserfs_key *r_key = NULL;
-
- ih = item_head(Sh, 0);
- if (tb->CFR[h])
- r_key = internal_key(tb->CFR[h], tb->rkey[h]);
-
- if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
- /* shifting may merge items which might save space */
- -
- ((!h
- && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
- -
- ((!h && r_key
- && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
- + ((h) ? KEY_SIZE : 0)) {
- /* node can not be removed */
- if (sfree >= levbytes) {
- /* new item fits into node S[h] without any shifting */
- if (!h)
- tb->s0num =
- B_NR_ITEMS(Sh) +
- ((mode == M_INSERT) ? 1 : 0);
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
- }
- }
- PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
- return !NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when increasing its size by
- * Inserting or Pasting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- * tb tree_balance structure;
- * h current level of the node;
- * inum item number in S[h];
- * mode i - insert, p - paste;
- * Returns: 1 - schedule occurred;
- * 0 - balancing for higher levels needed;
- * -1 - no balancing for higher levels needed;
- * -2 - no disk space.
- */
-/* ip means Inserting or Pasting */
-static int ip_check_balance(struct tree_balance *tb, int h)
-{
- struct virtual_node *vn = tb->tb_vn;
- /*
- * Number of bytes that must be inserted into (value is negative
- * if bytes are deleted) buffer which contains node being balanced.
- * The mnemonic is that the attempted change in node space used
- * level is levbytes bytes.
- */
- int levbytes;
- int ret;
-
- int lfree, sfree, rfree /* free space in L, S and R */ ;
-
- /*
-	 * nver is short for number of vertices, and lnver is the number if
-	 * we shift to the left, rnver is the number if we shift to the
-	 * right, and lrnver is the number if we shift in both directions.
-	 * The goal is to minimize first the number of vertices, and second,
-	 * the number of vertices whose contents are changed by shifting,
-	 * and third the number of uncached vertices whose contents are
-	 * changed by shifting and must be read from disk.
- */
- int nver, lnver, rnver, lrnver;
-
- /*
- * used at leaf level only, S0 = S[0] is the node being balanced,
- * sInum [ I = 0,1,2 ] is the number of items that will
- * remain in node SI after balancing. S1 and S2 are new
- * nodes that might be created.
- */
-
- /*
- * we perform 8 calls to get_num_ver(). For each call we
- * calculate five parameters. where 4th parameter is s1bytes
- * and 5th - s2bytes
- *
- * s0num, s1num, s2num for 8 cases
- * 0,1 - do not shift and do not shift but bottle
- * 2 - shift only whole item to left
- * 3 - shift to left and bottle as much as possible
-	 * 4,5 - shift to right (whole items and as much as possible)
- * 6,7 - shift to both directions (whole items and as much as possible)
- */
- short snum012[40] = { 0, };
-
- /* Sh is the node whose balance is currently being checked */
- struct buffer_head *Sh;
-
- Sh = PATH_H_PBUFFER(tb->tb_path, h);
- levbytes = tb->insert_size[h];
-
- /* Calculate balance parameters for creating new root. */
- if (!Sh) {
- if (!h)
- reiserfs_panic(tb->tb_sb, "vs-8210",
- "S[0] can not be 0");
- switch (ret = get_empty_nodes(tb, h)) {
- /* no balancing for higher levels needed */
- case CARRY_ON:
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
-
- case NO_DISK_SPACE:
- case REPEAT_SEARCH:
- return ret;
- default:
- reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
- "return value of get_empty_nodes");
- }
- }
-
- /* get parents of S[h] neighbors. */
- ret = get_parents(tb, h);
- if (ret != CARRY_ON)
- return ret;
-
- sfree = B_FREE_SPACE(Sh);
-
- /* get free space of neighbors */
- rfree = get_rfree(tb, h);
- lfree = get_lfree(tb, h);
-
- /* and new item fits into node S[h] without any shifting */
- if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
- NO_BALANCING_NEEDED)
- return NO_BALANCING_NEEDED;
-
- create_virtual_node(tb, h);
-
- /*
- * determine maximal number of items we can shift to the left
- * neighbor (in tb structure) and the maximal number of bytes
- * that can flow to the left neighbor from the left most liquid
- * item that cannot be shifted from S[0] entirely (returned value)
- */
- check_left(tb, h, lfree);
-
- /*
- * determine maximal number of items we can shift to the right
- * neighbor (in tb structure) and the maximal number of bytes
- * that can flow to the right neighbor from the right most liquid
- * item that cannot be shifted from S[0] entirely (returned value)
- */
- check_right(tb, h, rfree);
-
- /*
- * all contents of internal node S[h] can be moved into its
- * neighbors, S[h] will be removed after balancing
- */
- if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
- int to_r;
-
- /*
-		 * Since we are working on internal nodes, and our internal
-		 * nodes have fixed size entries, we can balance by the
-		 * number of items rather than the space they consume. In
-		 * this routine we set the left node equal to the right
-		 * node, allowing a difference of less than or equal to 1
-		 * child pointer (see the worked example after this block).
- */
- to_r =
- ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
- vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
- tb->rnum[h]);
- set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
- -1, -1);
- return CARRY_ON;
- }
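
As promised above, the to_r expression is an even split in disguise: with k = MAX_NR_KEY(Sh) + 1 pointer slots per node and N = vn_nr_item + 1 pointers to distribute, it gives the right neighbor half of the combined final contents minus what it already holds. A quick numeric check of the claim (all values invented):

#include <stdio.h>

int main(void)
{
	int k = 85;		/* MAX_NR_KEY(Sh) + 1: slots per node */
	int lnum = 60, rnum = 50;	/* free slots in L and R */
	int n = 31;		/* vn_nr_item + 1: pointers to place */
	int to_r, to_l;

	/* same as ((MAX_NR_KEY << 1) + 2 - lnum - rnum + nr_item + 1) / 2
	 * - (MAX_NR_KEY + 1 - rnum) with k and n substituted in */
	to_r = (2 * k + n - lnum - rnum) / 2 - (k - rnum);
	to_l = n - to_r;

	/* prints: L ends with 46, R ends with 45 */
	printf("L ends with %d, R ends with %d\n",
	       k - lnum + to_l, k - rnum + to_r);
	return 0;
}

The final occupancies differ by at most one child pointer, which is exactly the "difference of less than or equal to 1" promised by the comment.
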
-
- /*
-	 * this checks the balance condition: no two neighboring nodes
-	 * may fit into one node together
- */
- RFALSE(h &&
- (tb->lnum[h] >= vn->vn_nr_item + 1 ||
- tb->rnum[h] >= vn->vn_nr_item + 1),
- "vs-8220: tree is not balanced on internal level");
- RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
- (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
- "vs-8225: tree is not balanced on leaf level");
-
- /*
- * all contents of S[0] can be moved into its neighbors
- * S[0] will be removed after balancing.
- */
- if (!h && is_leaf_removable(tb))
- return CARRY_ON;
-
- /*
-	 * why do we perform this check here rather than earlier?
-	 * Answer: we can win 1 node in some cases above. Moreover, we
-	 * checked it above, when we checked that S[0] is not removable
-	 * in principle
- */
-
- /* new item fits into node S[h] without any shifting */
- if (sfree >= levbytes) {
- if (!h)
- tb->s0num = vn->vn_nr_item;
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
- }
-
- {
- int lpar, rpar, nset, lset, rset, lrset;
- /* regular overflowing of the node */
-
- /*
- * get_num_ver works in 2 modes (FLOW & NO_FLOW)
- * lpar, rpar - number of items we can shift to left/right
- * neighbor (including splitting item)
- * nset, lset, rset, lrset - shows, whether flowing items
- * give better packing
- */
-#define FLOW 1
-#define NO_FLOW	0	/* do not do any splitting */
-
- /* we choose one of the following */
-#define NOTHING_SHIFT_NO_FLOW 0
-#define NOTHING_SHIFT_FLOW 5
-#define LEFT_SHIFT_NO_FLOW 10
-#define LEFT_SHIFT_FLOW 15
-#define RIGHT_SHIFT_NO_FLOW 20
-#define RIGHT_SHIFT_FLOW 25
-#define LR_SHIFT_NO_FLOW 30
-#define LR_SHIFT_FLOW 35
-
- lpar = tb->lnum[h];
- rpar = tb->rnum[h];
-
- /*
- * calculate number of blocks S[h] must be split into when
- * nothing is shifted to the neighbors, as well as number of
- * items in each part of the split node (s012 numbers),
- * and number of bytes (s1bytes) of the shared drop which
- * flow to S1 if any
- */
- nset = NOTHING_SHIFT_NO_FLOW;
- nver = get_num_ver(vn->vn_mode, tb, h,
- 0, -1, h ? vn->vn_nr_item : 0, -1,
- snum012, NO_FLOW);
-
- if (!h) {
- int nver1;
-
- /*
- * note, that in this case we try to bottle
- * between S[0] and S1 (S1 - the first new node)
- */
- nver1 = get_num_ver(vn->vn_mode, tb, h,
- 0, -1, 0, -1,
- snum012 + NOTHING_SHIFT_FLOW, FLOW);
- if (nver > nver1)
- nset = NOTHING_SHIFT_FLOW, nver = nver1;
- }
-
- /*
- * calculate number of blocks S[h] must be split into when
- * l_shift_num first items and l_shift_bytes of the right
- * most liquid item to be shifted are shifted to the left
- * neighbor, as well as number of items in each part of the
-		 * split node (s012 numbers), and number of bytes
- * (s1bytes) of the shared drop which flow to S1 if any
- */
- lset = LEFT_SHIFT_NO_FLOW;
- lnver = get_num_ver(vn->vn_mode, tb, h,
- lpar - ((h || tb->lbytes == -1) ? 0 : 1),
- -1, h ? vn->vn_nr_item : 0, -1,
- snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
- if (!h) {
- int lnver1;
-
- lnver1 = get_num_ver(vn->vn_mode, tb, h,
- lpar -
- ((tb->lbytes != -1) ? 1 : 0),
- tb->lbytes, 0, -1,
- snum012 + LEFT_SHIFT_FLOW, FLOW);
- if (lnver > lnver1)
- lset = LEFT_SHIFT_FLOW, lnver = lnver1;
- }
-
- /*
- * calculate number of blocks S[h] must be split into when
- * r_shift_num first items and r_shift_bytes of the left most
- * liquid item to be shifted are shifted to the right neighbor,
-		 * as well as number of items in each part of the split
- * node (s012 numbers), and number of bytes (s1bytes) of the
- * shared drop which flow to S1 if any
- */
- rset = RIGHT_SHIFT_NO_FLOW;
- rnver = get_num_ver(vn->vn_mode, tb, h,
- 0, -1,
- h ? (vn->vn_nr_item - rpar) : (rpar -
- ((tb->
- rbytes !=
- -1) ? 1 :
- 0)), -1,
- snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
- if (!h) {
- int rnver1;
-
- rnver1 = get_num_ver(vn->vn_mode, tb, h,
- 0, -1,
- (rpar -
- ((tb->rbytes != -1) ? 1 : 0)),
- tb->rbytes,
- snum012 + RIGHT_SHIFT_FLOW, FLOW);
-
- if (rnver > rnver1)
- rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
- }
-
- /*
- * calculate number of blocks S[h] must be split into when
- * items are shifted in both directions, as well as number
-		 * of items in each part of the split node (s012 numbers),
- * and number of bytes (s1bytes) of the shared drop which
- * flow to S1 if any
- */
- lrset = LR_SHIFT_NO_FLOW;
- lrnver = get_num_ver(vn->vn_mode, tb, h,
- lpar - ((h || tb->lbytes == -1) ? 0 : 1),
- -1,
- h ? (vn->vn_nr_item - rpar) : (rpar -
- ((tb->
- rbytes !=
- -1) ? 1 :
- 0)), -1,
- snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
- if (!h) {
- int lrnver1;
-
- lrnver1 = get_num_ver(vn->vn_mode, tb, h,
- lpar -
- ((tb->lbytes != -1) ? 1 : 0),
- tb->lbytes,
- (rpar -
- ((tb->rbytes != -1) ? 1 : 0)),
- tb->rbytes,
- snum012 + LR_SHIFT_FLOW, FLOW);
- if (lrnver > lrnver1)
- lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
- }
-
- /*
- * Our general shifting strategy is:
-		 * 1) to minimize the number of new nodes;
-		 * 2) to minimize the number of neighbors involved in shifting;
-		 * 3) to minimize the number of disk reads;
- */
-
- /* we can win TWO or ONE nodes by shifting in both directions */
- if (lrnver < lnver && lrnver < rnver) {
- RFALSE(h &&
- (tb->lnum[h] != 1 ||
- tb->rnum[h] != 1 ||
- lrnver != 1 || rnver != 2 || lnver != 2
- || h != 1), "vs-8230: bad h");
- if (lrset == LR_SHIFT_FLOW)
- set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
- lrnver, snum012 + lrset,
- tb->lbytes, tb->rbytes);
- else
- set_parameters(tb, h,
- tb->lnum[h] -
- ((tb->lbytes == -1) ? 0 : 1),
- tb->rnum[h] -
- ((tb->rbytes == -1) ? 0 : 1),
- lrnver, snum012 + lrset, -1, -1);
-
- return CARRY_ON;
- }
-
- /*
- * if shifting doesn't lead to better packing
- * then don't shift
- */
- if (nver == lrnver) {
- set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
- -1);
- return CARRY_ON;
- }
-
- /*
- * now we know that for better packing shifting in only one
- * direction either to the left or to the right is required
- */
-
- /*
- * if shifting to the left is better than
- * shifting to the right
- */
- if (lnver < rnver) {
- SET_PAR_SHIFT_LEFT;
- return CARRY_ON;
- }
-
- /*
- * if shifting to the right is better than
- * shifting to the left
- */
- if (lnver > rnver) {
- SET_PAR_SHIFT_RIGHT;
- return CARRY_ON;
- }
-
- /*
- * now shifting in either direction gives the same number
- * of nodes and we can make use of the cached neighbors
- */
- if (is_left_neighbor_in_cache(tb, h)) {
- SET_PAR_SHIFT_LEFT;
- return CARRY_ON;
- }
-
- /*
-		 * shift to the right regardless of whether the
-		 * right neighbor is in cache or not
- */
- SET_PAR_SHIFT_RIGHT;
- return CARRY_ON;
- }
-}
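
The long tail of ip_check_balance() is a four-way minimization. Restated without the tree plumbing, the decision ladder is: shift both ways if that is strictly best, do not shift at all if shifting buys nothing, otherwise take the cheaper single direction, breaking ties toward a cached left neighbor. A hedged sketch of just that ladder (enum and names invented):

#include <stdio.h>

enum toy_choice { TOY_BOTH, TOY_NONE, TOY_LEFT, TOY_RIGHT };

/* pick a shift plan from the four candidate node counts */
static enum toy_choice toy_pick(int nver, int lnver, int rnver, int lrnver,
				int left_cached)
{
	if (lrnver < lnver && lrnver < rnver)
		return TOY_BOTH;	/* both-way shift wins a node */
	if (nver == lrnver)
		return TOY_NONE;	/* shifting does not pack better */
	if (lnver < rnver)
		return TOY_LEFT;
	if (lnver > rnver)
		return TOY_RIGHT;
	return left_cached ? TOY_LEFT : TOY_RIGHT;	/* tie-break */
}

int main(void)
{
	printf("%d\n", toy_pick(2, 2, 2, 1, 0));	/* prints 0: BOTH */
	printf("%d\n", toy_pick(2, 2, 2, 2, 1));	/* prints 1: NONE */
	return 0;
}
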
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting for INTERNAL node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- * tb tree_balance structure;
- * h current level of the node;
- * inum item number in S[h];
- *	mode	d - delete, c - cut;
- * Returns: 1 - schedule occurred;
- * 0 - balancing for higher levels needed;
- * -1 - no balancing for higher levels needed;
- * -2 - no disk space.
- *
- * Note: Items of internal nodes have fixed size, so the balance condition for
- * the internal part of S+tree is as for the B-trees.
- */
-static int dc_check_balance_internal(struct tree_balance *tb, int h)
-{
- struct virtual_node *vn = tb->tb_vn;
-
- /*
- * Sh is the node whose balance is currently being checked,
- * and Fh is its father.
- */
- struct buffer_head *Sh, *Fh;
- int ret;
- int lfree, rfree /* free space in L and R */ ;
-
- Sh = PATH_H_PBUFFER(tb->tb_path, h);
- Fh = PATH_H_PPARENT(tb->tb_path, h);
-
- /*
- * using tb->insert_size[h], which is negative in this case,
- * create_virtual_node calculates:
- * new_nr_item = number of items node would have if operation is
- * performed without balancing (new_nr_item);
- */
- create_virtual_node(tb, h);
-
- if (!Fh) { /* S[h] is the root. */
- /* no balancing for higher levels needed */
- if (vn->vn_nr_item > 0) {
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
- }
- /*
- * new_nr_item == 0.
- * Current root will be deleted resulting in
- * decrementing the tree height.
- */
- set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- if ((ret = get_parents(tb, h)) != CARRY_ON)
- return ret;
-
- /* get free space of neighbors */
- rfree = get_rfree(tb, h);
- lfree = get_lfree(tb, h);
-
- /* determine maximal number of items we can fit into neighbors */
- check_left(tb, h, lfree);
- check_right(tb, h, rfree);
-
- /*
- * Balance condition for the internal node is valid.
- * In this case we balance only if it leads to better packing.
- */
- if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
- /*
- * Here we join S[h] with one of its neighbors,
- * which is impossible with greater values of new_nr_item.
- */
- if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
- /* All contents of S[h] can be moved to L[h]. */
- if (tb->lnum[h] >= vn->vn_nr_item + 1) {
- int n;
- int order_L;
-
- order_L =
- ((n =
- PATH_H_B_ITEM_ORDER(tb->tb_path,
- h)) ==
- 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
- n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
- (DC_SIZE + KEY_SIZE);
- set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
- -1);
- return CARRY_ON;
- }
-
- /* All contents of S[h] can be moved to R[h]. */
- if (tb->rnum[h] >= vn->vn_nr_item + 1) {
- int n;
- int order_R;
-
- order_R =
- ((n =
- PATH_H_B_ITEM_ORDER(tb->tb_path,
- h)) ==
- B_NR_ITEMS(Fh)) ? 0 : n + 1;
- n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
- (DC_SIZE + KEY_SIZE);
- set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
- -1);
- return CARRY_ON;
- }
- }
-
- /*
- * All contents of S[h] can be moved to the neighbors
- * (L[h] & R[h]).
- */
- if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
- int to_r;
-
- to_r =
- ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
- tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
- (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
- set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
- 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- /* Balancing does not lead to better packing. */
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
- }
-
- /*
-	 * Current node contains an insufficient number of items.
- * Balancing is required.
- */
- /* Check whether we can merge S[h] with left neighbor. */
- if (tb->lnum[h] >= vn->vn_nr_item + 1)
- if (is_left_neighbor_in_cache(tb, h)
- || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
- int n;
- int order_L;
-
- order_L =
- ((n =
- PATH_H_B_ITEM_ORDER(tb->tb_path,
- h)) ==
- 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
- n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
- KEY_SIZE);
- set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- /* Check whether we can merge S[h] with right neighbor. */
- if (tb->rnum[h] >= vn->vn_nr_item + 1) {
- int n;
- int order_R;
-
- n = PATH_H_B_ITEM_ORDER(tb->tb_path, h);
- order_R = (n == B_NR_ITEMS(Fh)) ? 0 : n + 1;
- n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
- (DC_SIZE + KEY_SIZE);
- set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
- if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
- int to_r;
-
- to_r =
- ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
- vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
- tb->rnum[h]);
- set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
- -1, -1);
- return CARRY_ON;
- }
-
- /* For internal nodes try to borrow item from a neighbor */
- RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
-
- /* Borrow one or two items from caching neighbor */
- if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
- int from_l;
-
- from_l =
- (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
- 1) / 2 - (vn->vn_nr_item + 1);
- set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
- return CARRY_ON;
- }
-
- set_parameters(tb, h, 0,
- -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
- 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
- return CARRY_ON;
-}
-
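-/*
- * Editorial note, not part of the original source: the to_r arithmetic
- * above is easier to see in terms of node occupancy.  With slots =
- * MAX_NR_KEY + 1 pointers per node and lnum/rnum the free capacity of
- * the two neighbors, the formula is "half of the combined final
- * occupancy, minus what R already holds".  A minimal standalone sketch
- * with made-up numbers:
- */
-#include <assert.h>
-
-static int split_to_right(int slots, int lnum, int rnum, int nr_ptrs)
-{
- int occ_l = slots - lnum; /* pointers already in L */
- int occ_r = slots - rnum; /* pointers already in R */
-
- /* algebraically identical to the to_r expression above */
- return (occ_l + occ_r + nr_ptrs) / 2 - occ_r;
-}
-
-int main(void)
-{
- /* 10-slot nodes: L holds 4, R holds 6, S has 5 pointers to move */
- assert(split_to_right(10, 6, 4, 5) == 1); /* R gets 1, L gets 4 */
- return 0;
-}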
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Truncating for LEAF node of S+tree.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- * tb tree_balance structure;
- * h current level of the node;
- * inum item number in S[h];
- * mode d - delete, c - cut (truncate);
- * Returns: 1 - schedule occurred;
- * 0 - balancing for higher levels needed;
- * -1 - no balancing for higher levels needed;
- * -2 - no disk space.
- */
-static int dc_check_balance_leaf(struct tree_balance *tb, int h)
-{
- struct virtual_node *vn = tb->tb_vn;
-
- /*
- * Number of bytes that must be deleted from the buffer which
- * contains the node being balanced (the value is negative when
- * bytes are deleted). The mnemonic is that the attempted change
- * in node space usage is levbytes bytes.
- */
- int levbytes;
-
- /* the maximal item size */
- int maxsize, ret;
-
- /*
- * S0 is the node whose balance is currently being checked,
- * and F0 is its father.
- */
- struct buffer_head *S0, *F0;
- int lfree, rfree; /* free space in L and R */
-
- S0 = PATH_H_PBUFFER(tb->tb_path, 0);
- F0 = PATH_H_PPARENT(tb->tb_path, 0);
-
- levbytes = tb->insert_size[h];
-
- maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
-
- if (!F0) { /* S[0] is the root now. */
-
- RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
- "vs-8240: attempt to create empty buffer tree");
-
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
- }
-
- if ((ret = get_parents(tb, h)) != CARRY_ON)
- return ret;
-
- /* get free space of neighbors */
- rfree = get_rfree(tb, h);
- lfree = get_lfree(tb, h);
-
- create_virtual_node(tb, h);
-
- /* if 3 leaves can be merged into one, set parameters and return */
- if (are_leaves_removable(tb, lfree, rfree))
- return CARRY_ON;
-
- /*
- * determine maximal number of items we can shift to the left/right
- * neighbor and the maximal number of bytes that can flow to the
- * left/right neighbor from the left/right most liquid item that
- * cannot be shifted from S[0] entirely
- */
- check_left(tb, h, lfree);
- check_right(tb, h, rfree);
-
- /* check whether we can merge S with left neighbor. */
- if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
- if (is_left_neighbor_in_cache(tb, h) ||
- /* S can not be merged with R */
- (tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item ||
- !tb->FR[h]) {
-
- RFALSE(!tb->FL[h],
- "vs-8245: dc_check_balance_leaf: FL[h] must exist");
-
- /* set parameter to merge S[0] with its left neighbor */
- set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- /* check whether we can merge S[0] with right neighbor. */
- if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
- set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
- return CARRY_ON;
- }
-
- /*
- * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
- * Set parameters and return
- */
- if (is_leaf_removable(tb))
- return CARRY_ON;
-
- /* Balancing is not required. */
- tb->s0num = vn->vn_nr_item;
- set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED;
-}
-
-/*
- * Check whether current node S[h] is balanced when Decreasing its size by
- * Deleting or Cutting.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- * tb tree_balance structure;
- * h current level of the node;
- * inum item number in S[h];
- * mode d - delete, c - cut.
- * Returns: 1 - schedule occurred;
- * 0 - balancing for higher levels needed;
- * -1 - no balancing for higher levels needed;
- * -2 - no disk space.
- */
-static int dc_check_balance(struct tree_balance *tb, int h)
-{
- RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
- "vs-8250: S is not initialized");
-
- if (h)
- return dc_check_balance_internal(tb, h);
- else
- return dc_check_balance_leaf(tb, h);
-}
-
-/*
- * Check whether current node S[h] is balanced.
- * Calculate parameters for balancing for current level h.
- * Parameters:
- *
- * tb tree_balance structure:
- *
- * tb is a large structure that must be read about in the header
- * file at the same time as this procedure if the reader is
- * to successfully understand this procedure
- *
- * h current level of the node;
- * inum item number in S[h];
- * mode i - insert, p - paste, d - delete, c - cut.
- * Returns: 1 - schedule occurred;
- * 0 - balancing for higher levels needed;
- * -1 - no balancing for higher levels needed;
- * -2 - no disk space.
- */
-static int check_balance(int mode,
- struct tree_balance *tb,
- int h,
- int inum,
- int pos_in_item,
- struct item_head *ins_ih, const void *data)
-{
- struct virtual_node *vn;
-
- vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
- vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
- vn->vn_mode = mode;
- vn->vn_affected_item_num = inum;
- vn->vn_pos_in_item = pos_in_item;
- vn->vn_ins_ih = ins_ih;
- vn->vn_data = data;
-
- RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
- "vs-8255: ins_ih can not be 0 in insert mode");
-
- /* Calculate balance parameters when size of node is increasing. */
- if (tb->insert_size[h] > 0)
- return ip_check_balance(tb, h);
-
- /* Calculate balance parameters when size of node is decreasing. */
- return dc_check_balance(tb, h);
-}
-
-/* Check whether parent at the path is the really parent of the current node.*/
-static int get_direct_parent(struct tree_balance *tb, int h)
-{
- struct buffer_head *bh;
- struct treepath *path = tb->tb_path;
- int position;
- int path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
-
- /* We are in the root or in the new root. */
- if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
-
- RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
- "PAP-8260: invalid offset in the path");
-
- if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
- b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
- /* Root is not changed. */
- PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
- PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
- return CARRY_ON;
- }
- /* Root is changed and we must recalculate the path. */
- return REPEAT_SEARCH;
- }
-
- /* Parent in the path is not in the tree. */
- bh = PATH_OFFSET_PBUFFER(path, path_offset - 1);
- if (!B_IS_IN_TREE(bh))
- return REPEAT_SEARCH;
-
- position = PATH_OFFSET_POSITION(path, path_offset - 1);
- if (position > B_NR_ITEMS(bh))
- return REPEAT_SEARCH;
-
- /* Parent in the path is not parent of the current node in the tree. */
- if (B_N_CHILD_NUM(bh, position) !=
- PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
- return REPEAT_SEARCH;
-
- if (buffer_locked(bh)) {
- int depth = reiserfs_write_unlock_nested(tb->tb_sb);
- __wait_on_buffer(bh);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (FILESYSTEM_CHANGED_TB(tb))
- return REPEAT_SEARCH;
- }
-
- /*
- * Parent in the path is unlocked and really parent
- * of the current node.
- */
- return CARRY_ON;
-}
-
-/*
- * Using lnum[h] and rnum[h], determine which neighbors of S[h] we
- * need in order to balance S[h], and get them if necessary.
- * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
- */
-static int get_neighbors(struct tree_balance *tb, int h)
-{
- int child_position;
- int path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
- unsigned long son_number;
- struct super_block *sb = tb->tb_sb;
- struct buffer_head *bh;
- int depth;
-
- PROC_INFO_INC(sb, get_neighbors[h]);
-
- if (tb->lnum[h]) {
- /* We need left neighbor to balance S[h]. */
- PROC_INFO_INC(sb, need_l_neighbor[h]);
- bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
- RFALSE(bh == tb->FL[h] &&
- !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
- "PAP-8270: invalid position in the parent");
-
- child_position = (bh == tb->FL[h]) ?
- tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
- son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
- depth = reiserfs_write_unlock_nested(tb->tb_sb);
- bh = sb_bread(sb, son_number);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (!bh)
- return IO_ERROR;
- if (FILESYSTEM_CHANGED_TB(tb)) {
- brelse(bh);
- PROC_INFO_INC(sb, get_neighbors_restart[h]);
- return REPEAT_SEARCH;
- }
-
- RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
- child_position > B_NR_ITEMS(tb->FL[h]) ||
- B_N_CHILD_NUM(tb->FL[h], child_position) !=
- bh->b_blocknr, "PAP-8275: invalid parent");
- RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
- RFALSE(!h &&
- B_FREE_SPACE(bh) !=
- MAX_CHILD_SIZE(bh) -
- dc_size(B_N_CHILD(tb->FL[0], child_position)),
- "PAP-8290: invalid child size of left neighbor");
-
- brelse(tb->L[h]);
- tb->L[h] = bh;
- }
-
- /* We need right neighbor to balance S[path_offset]. */
- if (tb->rnum[h]) {
- PROC_INFO_INC(sb, need_r_neighbor[h]);
- bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
-
- RFALSE(bh == tb->FR[h] &&
- PATH_OFFSET_POSITION(tb->tb_path,
- path_offset) >=
- B_NR_ITEMS(bh),
- "PAP-8295: invalid position in the parent");
-
- child_position = (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
- son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
- depth = reiserfs_write_unlock_nested(tb->tb_sb);
- bh = sb_bread(sb, son_number);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (!bh)
- return IO_ERROR;
- if (FILESYSTEM_CHANGED_TB(tb)) {
- brelse(bh);
- PROC_INFO_INC(sb, get_neighbors_restart[h]);
- return REPEAT_SEARCH;
- }
- brelse(tb->R[h]);
- tb->R[h] = bh;
-
- RFALSE(!h
- && B_FREE_SPACE(bh) !=
- MAX_CHILD_SIZE(bh) -
- dc_size(B_N_CHILD(tb->FR[0], child_position)),
- "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
- B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
- dc_size(B_N_CHILD(tb->FR[0], child_position)));
-
- }
- return CARRY_ON;
-}
-
-static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
-{
- int max_num_of_items;
- int max_num_of_entries;
- unsigned long blocksize = sb->s_blocksize;
-
-#define MIN_NAME_LEN 1
-
- max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
- max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
- (DEH_SIZE + MIN_NAME_LEN);
-
- return sizeof(struct virtual_node) +
- max(max_num_of_items * sizeof(struct virtual_item),
- sizeof(struct virtual_item) +
- struct_size_t(struct direntry_uarea, entry_sizes,
- max_num_of_entries));
-}
-
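-/*
- * Worked example (illustrative only; the real constants live in
- * reiserfs.h and the sizes below are stand-ins): for a 4 KiB block the
- * buffer must cover either many minimal items or a single directory
- * item holding the maximum number of entries, whichever needs more
- * virtual_item bookkeeping; hence the max() above.
- */
-#include <stdio.h>
-
-int main(void)
-{
- unsigned long blocksize = 4096;
- /* assumed on-disk sizes, for illustration only */
- unsigned long blkh = 24, ih = 24, min_item = 1, deh = 16, min_name = 1;
-
- unsigned long max_items = (blocksize - blkh) / (ih + min_item);
- unsigned long max_entries = (blocksize - blkh - ih) / (deh + min_name);
-
- printf("max items %lu, max dir entries %lu\n", max_items, max_entries);
- return 0;
-}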
-/*
- * Maybe we should fail the balancing we are about to perform if
- * kmalloc fails several times. For now it will just loop until
- * kmalloc gets the required memory.
- */
-static int get_mem_for_virtual_node(struct tree_balance *tb)
-{
- int check_fs = 0;
- int size;
- char *buf;
-
- size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
-
- /* we have to allocate more memory for virtual node */
- if (size > tb->vn_buf_size) {
- if (tb->vn_buf) {
- /* free memory allocated before */
- kfree(tb->vn_buf);
- /* this is not needed if kfree is atomic */
- check_fs = 1;
- }
-
- /* the virtual node now requires more memory */
- tb->vn_buf_size = size;
-
- /* get memory for virtual item */
- buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
- if (!buf) {
- /*
- * getting memory with GFP_KERNEL priority may involve
- * balancing now (due to indirect_to_direct conversion
- * on dcache shrinking). So, release path and collected
- * resources here
- */
- free_buffers_in_tb(tb);
- buf = kmalloc(size, GFP_NOFS);
- if (!buf) {
- tb->vn_buf_size = 0;
- }
- tb->vn_buf = buf;
- schedule();
- return REPEAT_SEARCH;
- }
-
- tb->vn_buf = buf;
- }
-
- if (check_fs && FILESYSTEM_CHANGED_TB(tb))
- return REPEAT_SEARCH;
-
- return CARRY_ON;
-}
-
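-/*
- * A minimal userspace sketch (hypothetical names, not kernel code) of
- * the allocation discipline above: try a cheap non-sleeping allocation
- * while resources are held; on failure release everything, fall back to
- * an allocation that may sleep, and report REPEAT_SEARCH so the caller
- * redoes its search, since the tree may have changed meanwhile.
- */
-#include <stdio.h>
-#include <stdlib.h>
-
-enum { CARRY_ON, REPEAT_SEARCH };
-
-static void *try_alloc_atomic(size_t n) { (void)n; return NULL; }
-static void release_held_resources(void) { puts("released buffers"); }
-
-static int get_buf(void **buf, size_t size)
-{
- *buf = try_alloc_atomic(size); /* non-sleeping attempt */
- if (!*buf) {
- /*
-  * a sleeping allocation may recurse into balancing, so
-  * drop everything we hold first, as the code above does
-  */
- release_held_resources();
- *buf = malloc(size); /* fallback that may sleep */
- return REPEAT_SEARCH; /* caller must search again */
- }
- return CARRY_ON;
-}
-
-int main(void)
-{
- void *buf;
- int ret = get_buf(&buf, 128);
-
- printf("%s\n", ret == CARRY_ON ? "CARRY_ON" : "REPEAT_SEARCH");
- free(buf);
- return 0;
-}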
-#ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check(struct super_block *sb,
- struct buffer_head *bh,
- const char *descr, int level)
-{
- if (bh) {
- if (atomic_read(&(bh->b_count)) <= 0)
-
- reiserfs_panic(sb, "jmacd-1", "negative or zero "
- "reference counter for buffer %s[%d] "
- "(%b)", descr, level, bh);
-
- if (!buffer_uptodate(bh))
- reiserfs_panic(sb, "jmacd-2", "buffer is not up "
- "to date %s[%d] (%b)",
- descr, level, bh);
-
- if (!B_IS_IN_TREE(bh))
- reiserfs_panic(sb, "jmacd-3", "buffer is not "
- "in tree %s[%d] (%b)",
- descr, level, bh);
-
- if (bh->b_bdev != sb->s_bdev)
- reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
- "device %s[%d] (%b)",
- descr, level, bh);
-
- if (bh->b_size != sb->s_blocksize)
- reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
- "blocksize %s[%d] (%b)",
- descr, level, bh);
-
- if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
- reiserfs_panic(sb, "jmacd-6", "buffer block "
- "number too high %s[%d] (%b)",
- descr, level, bh);
- }
-}
-#else
-static void tb_buffer_sanity_check(struct super_block *sb,
- struct buffer_head *bh,
- const char *descr, int level)
-{
-}
-#endif
-
-static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
-{
- return reiserfs_prepare_for_journal(s, bh, 0);
-}
-
-static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
-{
- struct buffer_head *locked;
-#ifdef CONFIG_REISERFS_CHECK
- int repeat_counter = 0;
-#endif
- int i;
-
- do {
-
- locked = NULL;
-
- for (i = tb->tb_path->path_length;
- !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
- if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
- /*
- * if I understand correctly, we can only
- * be sure the last buffer in the path is
- * in the tree --clm
- */
-#ifdef CONFIG_REISERFS_CHECK
- if (PATH_PLAST_BUFFER(tb->tb_path) ==
- PATH_OFFSET_PBUFFER(tb->tb_path, i))
- tb_buffer_sanity_check(tb->tb_sb,
- PATH_OFFSET_PBUFFER(tb->tb_path, i),
- "S", tb->tb_path->path_length - i);
-#endif
- if (!clear_all_dirty_bits(tb->tb_sb,
- PATH_OFFSET_PBUFFER(tb->tb_path, i)))
- locked = PATH_OFFSET_PBUFFER(tb->tb_path, i);
- }
- }
-
- for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
- i++) {
-
- if (tb->lnum[i]) {
-
- if (tb->L[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->L[i],
- "L", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->L[i]))
- locked = tb->L[i];
- }
-
- if (!locked && tb->FL[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->FL[i],
- "FL", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->FL[i]))
- locked = tb->FL[i];
- }
-
- if (!locked && tb->CFL[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->CFL[i],
- "CFL", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->CFL[i]))
- locked = tb->CFL[i];
- }
-
- }
-
- if (!locked && (tb->rnum[i])) {
-
- if (tb->R[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->R[i],
- "R", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->R[i]))
- locked = tb->R[i];
- }
-
- if (!locked && tb->FR[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->FR[i],
- "FR", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->FR[i]))
- locked = tb->FR[i];
- }
-
- if (!locked && tb->CFR[i]) {
- tb_buffer_sanity_check(tb->tb_sb,
- tb->CFR[i],
- "CFR", i);
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->CFR[i]))
- locked = tb->CFR[i];
- }
- }
- }
-
- /*
- * as far as I can tell, this is not required. The FEB list
- * seems to be full of newly allocated nodes, which will
- * never be locked, dirty, or anything else.
- * To be safe, I'm putting the checks and waits in.
- * For the moment, they are needed to keep the code in
- * journal.c from complaining about the buffer.
- * That code is inside CONFIG_REISERFS_CHECK as well. --clm
- */
- for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
- if (tb->FEB[i]) {
- if (!clear_all_dirty_bits
- (tb->tb_sb, tb->FEB[i]))
- locked = tb->FEB[i];
- }
- }
-
- if (locked) {
- int depth;
-#ifdef CONFIG_REISERFS_CHECK
- repeat_counter++;
- if ((repeat_counter % 10000) == 0) {
- reiserfs_warning(tb->tb_sb, "reiserfs-8200",
- "too many iterations waiting "
- "for buffer to unlock "
- "(%b)", locked);
-
- /* Don't loop forever. Try to recover from possible error. */
-
- return (FILESYSTEM_CHANGED_TB(tb)) ?
- REPEAT_SEARCH : CARRY_ON;
- }
-#endif
- depth = reiserfs_write_unlock_nested(tb->tb_sb);
- __wait_on_buffer(locked);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (FILESYSTEM_CHANGED_TB(tb))
- return REPEAT_SEARCH;
- }
-
- } while (locked);
-
- return CARRY_ON;
-}
-
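-/*
- * Toy model (editorial, self-contained) of the loop structure above:
- * scan every buffer of interest, remember the first one that cannot be
- * prepared, wait on it, and rescan from scratch, because waiting may
- * have let the set of dirty/locked buffers change underneath us.
- */
-#include <stdio.h>
-
-struct toybuf { int locked; }; /* counts down each wait */
-
-static int try_prepare(struct toybuf *b)
-{
- return !b->locked; /* 1 = clean and unlocked */
-}
-
-int main(void)
-{
- struct toybuf bufs[3] = { {0}, {2}, {0} };
- struct toybuf *locked;
- int i;
-
- do {
- locked = NULL;
- for (i = 0; i < 3 && !locked; i++)
- if (!try_prepare(&bufs[i]))
- locked = &bufs[i];
- if (locked) {
- locked->locked--; /* stands in for __wait_on_buffer() */
- printf("waited on buffer %d\n", (int)(locked - bufs));
- }
- } while (locked);
-
- printf("all buffers ready\n");
- return 0;
-}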
-/*
- * Prepare for balancing, that is
- * get all necessary parents, and neighbors;
- * analyze what and where should be moved;
- * get sufficient number of new nodes;
- * Balancing will start only after all resources will be collected at a time.
- *
- * When ported to SMP kernels, only at the last moment after all needed nodes
- * are collected in cache, will the resources be locked using the usual
- * textbook ordered lock acquisition algorithms. Note that ensuring that
- * this code neither write locks what it does not need to write lock nor locks
- * out of order will be a pain in the butt that could have been avoided.
- * Grumble grumble. -Hans
- *
- * fix is meant in the sense of render unchanging
- *
- * Latency might be improved by first gathering a list of what buffers
- * are needed and then getting as many of them in parallel as possible? -Hans
- *
- * Parameters:
- * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
- * tb tree_balance structure;
- * inum item number in S[h];
- * pos_in_item - comment this if you can
- * ins_ih item head of item being inserted
- * data inserted item or data to be pasted
- * Returns: 1 - schedule occurred while the function worked;
- * 0 - schedule didn't occur while the function worked;
- * -1 - if no_disk_space
- */
-
-int fix_nodes(int op_mode, struct tree_balance *tb,
- struct item_head *ins_ih, const void *data)
-{
- int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
- int pos_in_item;
-
- /*
- * we set wait_tb_buffers_run when we have to restore any dirty
- * bits cleared during wait_tb_buffers_until_unlocked
- */
- int wait_tb_buffers_run = 0;
- struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
-
- ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
-
- pos_in_item = tb->tb_path->pos_in_item;
-
- tb->fs_gen = get_generation(tb->tb_sb);
-
- /*
- * we prepare and log the super here so it will already be in the
- * transaction when do_balance needs to change it.
- * This way do_balance won't have to schedule when trying to prepare
- * the super for logging
- */
- reiserfs_prepare_for_journal(tb->tb_sb,
- SB_BUFFER_WITH_SB(tb->tb_sb), 1);
- journal_mark_dirty(tb->transaction_handle,
- SB_BUFFER_WITH_SB(tb->tb_sb));
- if (FILESYSTEM_CHANGED_TB(tb))
- return REPEAT_SEARCH;
-
- /* this is possible during indirect_to_direct conversion */
- if (buffer_locked(tbS0)) {
- int depth = reiserfs_write_unlock_nested(tb->tb_sb);
- __wait_on_buffer(tbS0);
- reiserfs_write_lock_nested(tb->tb_sb, depth);
- if (FILESYSTEM_CHANGED_TB(tb))
- return REPEAT_SEARCH;
- }
-#ifdef CONFIG_REISERFS_CHECK
- if (REISERFS_SB(tb->tb_sb)->cur_tb) {
- print_cur_tb("fix_nodes");
- reiserfs_panic(tb->tb_sb, "PAP-8305",
- "there is pending do_balance");
- }
-
- if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
- reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
- "not uptodate at the beginning of fix_nodes "
- "or not in tree (mode %c)",
- tbS0, tbS0, op_mode);
-
- /* Check parameters. */
- switch (op_mode) {
- case M_INSERT:
- if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
- reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
- "item number %d (in S0 - %d) in case "
- "of insert", item_num,
- B_NR_ITEMS(tbS0));
- break;
- case M_PASTE:
- case M_DELETE:
- case M_CUT:
- if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
- print_block(tbS0, 0, -1, -1);
- reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
- "item number(%d); mode = %c "
- "insert_size = %d",
- item_num, op_mode,
- tb->insert_size[0]);
- }
- break;
- default:
- reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
- "of operation");
- }
-#endif
-
- if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
- /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
- return REPEAT_SEARCH;
-
- /* Starting from the leaf level; for all levels h of the tree. */
- for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
- ret = get_direct_parent(tb, h);
- if (ret != CARRY_ON)
- goto repeat;
-
- ret = check_balance(op_mode, tb, h, item_num,
- pos_in_item, ins_ih, data);
- if (ret != CARRY_ON) {
- if (ret == NO_BALANCING_NEEDED) {
- /* No balancing for higher levels needed. */
- ret = get_neighbors(tb, h);
- if (ret != CARRY_ON)
- goto repeat;
- if (h != MAX_HEIGHT - 1)
- tb->insert_size[h + 1] = 0;
- /*
- * ok, analysis and resource gathering
- * are complete
- */
- break;
- }
- goto repeat;
- }
-
- ret = get_neighbors(tb, h);
- if (ret != CARRY_ON)
- goto repeat;
-
- /*
- * No disk space, or schedule occurred and analysis may be
- * invalid and needs to be redone.
- */
- ret = get_empty_nodes(tb, h);
- if (ret != CARRY_ON)
- goto repeat;
-
- /*
- * We have a positive insert size but no nodes exist on this
- * level; this means that we are creating a new root.
- */
- if (!PATH_H_PBUFFER(tb->tb_path, h)) {
-
- RFALSE(tb->blknum[h] != 1,
- "PAP-8350: creating new empty root");
-
- if (h < MAX_HEIGHT - 1)
- tb->insert_size[h + 1] = 0;
- } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
- /*
- * The tree needs to be grown, so this node S[h]
- * which is the root node is split into two nodes,
- * and a new node (S[h+1]) will be created to
- * become the root node.
- */
- if (tb->blknum[h] > 1) {
-
- RFALSE(h == MAX_HEIGHT - 1,
- "PAP-8355: attempt to create too high of a tree");
-
- tb->insert_size[h + 1] =
- (DC_SIZE +
- KEY_SIZE) * (tb->blknum[h] - 1) +
- DC_SIZE;
- } else if (h < MAX_HEIGHT - 1)
- tb->insert_size[h + 1] = 0;
- } else
- tb->insert_size[h + 1] =
- (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
- }
-
- ret = wait_tb_buffers_until_unlocked(tb);
- if (ret == CARRY_ON) {
- if (FILESYSTEM_CHANGED_TB(tb)) {
- wait_tb_buffers_run = 1;
- ret = REPEAT_SEARCH;
- goto repeat;
- } else {
- return CARRY_ON;
- }
- } else {
- wait_tb_buffers_run = 1;
- goto repeat;
- }
-
-repeat:
- /*
- * fix_nodes was unable to perform its calculation because the
- * filesystem changed under us, we ran out of free disk space, or
- * an i/o failure occurred. In the first case the search will be
- * repeated. For now, free all resources acquired so far except
- * for the newly allocated nodes.
- */
- {
- int i;
-
- /* Release path buffers. */
- if (wait_tb_buffers_run) {
- pathrelse_and_restore(tb->tb_sb, tb->tb_path);
- } else {
- pathrelse(tb->tb_path);
- }
- /* brelse all resources collected for balancing */
- for (i = 0; i < MAX_HEIGHT; i++) {
- if (wait_tb_buffers_run) {
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->L[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->R[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->FL[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->FR[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->CFL[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb,
- tb->CFR[i]);
- }
-
- brelse(tb->L[i]);
- brelse(tb->R[i]);
- brelse(tb->FL[i]);
- brelse(tb->FR[i]);
- brelse(tb->CFL[i]);
- brelse(tb->CFR[i]);
-
- tb->L[i] = NULL;
- tb->R[i] = NULL;
- tb->FL[i] = NULL;
- tb->FR[i] = NULL;
- tb->CFL[i] = NULL;
- tb->CFR[i] = NULL;
- }
-
- if (wait_tb_buffers_run) {
- for (i = 0; i < MAX_FEB_SIZE; i++) {
- if (tb->FEB[i])
- reiserfs_restore_prepared_buffer
- (tb->tb_sb, tb->FEB[i]);
- }
- }
- return ret;
- }
-
-}
-
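-/*
- * Hypothetical caller-side sketch (illustrative, not a quote of the
- * real callers): fix_nodes() is meant to be retried whenever it
- * returns REPEAT_SEARCH, re-running the tree search first because the
- * path was released and the filesystem may have changed while buffers
- * were waited on.
- */
-#include <stdio.h>
-
-enum { CARRY_ON, REPEAT_SEARCH };
-
-/* toy stand-in: pretend the fs "changes" twice before settling */
-static int toy_fix_nodes(void)
-{
- static int changes = 2;
- return changes-- > 0 ? REPEAT_SEARCH : CARRY_ON;
-}
-
-int main(void)
-{
- int ret;
-
- do {
- /* a real caller re-runs its tree search here */
- ret = toy_fix_nodes();
- } while (ret == REPEAT_SEARCH);
-
- printf("balance may proceed: %d\n", ret == CARRY_ON);
- return 0;
-}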
-void unfix_nodes(struct tree_balance *tb)
-{
- int i;
-
- /* Release path buffers. */
- pathrelse_and_restore(tb->tb_sb, tb->tb_path);
-
- /* brelse all resources collected for balancing */
- for (i = 0; i < MAX_HEIGHT; i++) {
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
- reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
-
- brelse(tb->L[i]);
- brelse(tb->R[i]);
- brelse(tb->FL[i]);
- brelse(tb->FR[i]);
- brelse(tb->CFL[i]);
- brelse(tb->CFR[i]);
- }
-
- /* deal with list of allocated (used and unused) nodes */
- for (i = 0; i < MAX_FEB_SIZE; i++) {
- if (tb->FEB[i]) {
- b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
- /*
- * deallocate the block, which was not used by balancing,
- * and bforget the buffer for it
- */
- brelse(tb->FEB[i]);
- reiserfs_free_block(tb->transaction_handle, NULL,
- blocknr, 0);
- }
- if (tb->used[i]) {
- /* release used as new nodes including a new root */
- brelse(tb->used[i]);
- }
- }
-
- kfree(tb->vn_buf);
-
-}
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
deleted file mode 100644
index 7a26c4fe6c46..000000000000
--- a/fs/reiserfs/hashes.c
+++ /dev/null
@@ -1,177 +0,0 @@
-
-/*
- * Keyed 32-bit hash function using TEA in a Davis-Meyer function
- * H0 = Key
- * Hi = E Mi(Hi-1) + Hi-1
- *
- * (see Applied Cryptography, 2nd edition, p448).
- *
- * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
- *
- * Jeremy has agreed to the contents of reiserfs/README. -Hans
- * Yura's function is added (04/07/2000)
- */
-
-#include <linux/kernel.h>
-#include "reiserfs.h"
-#include <asm/types.h>
-
-#define DELTA 0x9E3779B9
-#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
-#define PARTROUNDS 6 /* 6 gets complete mixing */
-
-/* a, b, c, d - data; h0, h1 - accumulated hash */
-#define TEACORE(rounds) \
- do { \
- u32 sum = 0; \
- int n = rounds; \
- u32 b0, b1; \
- \
- b0 = h0; \
- b1 = h1; \
- \
- do \
- { \
- sum += DELTA; \
- b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
- b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
- } while(--n); \
- \
- h0 += b0; \
- h1 += b1; \
- } while(0)
-
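-/*
- * Editorial demonstration: the macro above, unrolled by hand for one
- * 16-byte block with PARTROUNDS = 6, using the same k[0]/k[1] seeds as
- * keyed_hash() below and made-up message words.  Standalone, with
- * stdint types in place of the kernel's u32.
- */
-#include <stdint.h>
-#include <stdio.h>
-
-int main(void)
-{
- uint32_t a = 1, b = 2, c = 3, d = 4; /* message words */
- uint32_t h0 = 0x9464a485u, h1 = 0x542e1a94u; /* k[0], k[1] */
- uint32_t b0 = h0, b1 = h1, sum = 0;
- int n = 6; /* PARTROUNDS */
-
- do {
- sum += 0x9E3779B9u; /* DELTA */
- b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
- b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
- } while (--n);
-
- h0 += b0; /* feed-forward, as in TEACORE above */
- h1 += b1;
- printf("h0=%08x h1=%08x\n", h0, h1);
- return 0;
-}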
-u32 keyed_hash(const signed char *msg, int len)
-{
- u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
-
- u32 h0 = k[0], h1 = k[1];
- u32 a, b, c, d;
- u32 pad;
- int i;
-
- /* assert(len >= 0 && len < 256); */
-
- pad = (u32) len | ((u32) len << 8);
- pad |= pad << 16;
-
- while (len >= 16) {
- a = (u32) msg[0] |
- (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
- b = (u32) msg[4] |
- (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
- c = (u32) msg[8] |
- (u32) msg[9] << 8 |
- (u32) msg[10] << 16 | (u32) msg[11] << 24;
- d = (u32) msg[12] |
- (u32) msg[13] << 8 |
- (u32) msg[14] << 16 | (u32) msg[15] << 24;
-
- TEACORE(PARTROUNDS);
-
- len -= 16;
- msg += 16;
- }
-
- if (len >= 12) {
- a = (u32) msg[0] |
- (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
- b = (u32) msg[4] |
- (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
- c = (u32) msg[8] |
- (u32) msg[9] << 8 |
- (u32) msg[10] << 16 | (u32) msg[11] << 24;
-
- d = pad;
- for (i = 12; i < len; i++) {
- d <<= 8;
- d |= msg[i];
- }
- } else if (len >= 8) {
- a = (u32) msg[0] |
- (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
- b = (u32) msg[4] |
- (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
-
- c = d = pad;
- for (i = 8; i < len; i++) {
- c <<= 8;
- c |= msg[i];
- }
- } else if (len >= 4) {
- a = (u32) msg[0] |
- (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
-
- b = c = d = pad;
- for (i = 4; i < len; i++) {
- b <<= 8;
- b |= msg[i];
- }
- } else {
- a = b = c = d = pad;
- for (i = 0; i < len; i++) {
- a <<= 8;
- a |= msg[i];
- }
- }
-
- TEACORE(FULLROUNDS);
-
-/* return 0;*/
- return h0 ^ h1;
-}
-
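-/*
- * Editorial illustration of the tail handling above: pad replicates the
- * length into every byte, and trailing message bytes are shifted in on
- * top of it, so e.g. a 3-byte tail "abc" packs to 0x03616263.
- * Standalone, with stdint types in place of the kernel's u32.
- */
-#include <stdint.h>
-#include <stdio.h>
-
-int main(void)
-{
- const signed char msg[] = { 'a', 'b', 'c' };
- int len = 3, i;
- uint32_t pad, a;
-
- pad = (uint32_t)len | ((uint32_t)len << 8);
- pad |= pad << 16; /* 0x03030303 */
-
- a = pad;
- for (i = 0; i < len; i++) { /* same loop shape as the len < 4 case */
- a <<= 8;
- a |= msg[i];
- }
- printf("pad=%08x a=%08x\n", pad, a); /* a == 0x03616263 */
- return 0;
-}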
-/*
- * What follows in this file is copyright 2000 by Hans Reiser, and the
- * licensing of what follows is governed by reiserfs/README
- */
-u32 yura_hash(const signed char *msg, int len)
-{
- int j, pow;
- u32 a, c;
- int i;
-
- for (pow = 1, i = 1; i < len; i++)
- pow = pow * 10;
-
- if (len == 1)
- a = msg[0] - 48;
- else
- a = (msg[0] - 48) * pow;
-
- for (i = 1; i < len; i++) {
- c = msg[i] - 48;
- for (pow = 1, j = i; j < len - 1; j++)
- pow = pow * 10;
- a = a + c * pow;
- }
-
- for (; i < 40; i++) {
- c = '0' - 48;
- for (pow = 1, j = i; j < len - 1; j++)
- pow = pow * 10;
- a = a + c * pow;
- }
-
- for (; i < 256; i++) {
- c = i;
- for (pow = 1, j = i; j < len - 1; j++)
- pow = pow * 10;
- a = a + c * pow;
- }
-
- a = a << 7;
- return a;
-}
-
-u32 r5_hash(const signed char *msg, int len)
-{
- u32 a = 0;
- while (*msg) {
- a += *msg << 4;
- a += *msg >> 4;
- a *= 11;
- msg++;
- }
- return a;
-}
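-/*
- * Editorial note: r5_hash ignores its len argument and walks msg up to
- * the first NUL byte.  A standalone copy for illustration; for the
- * one-byte name "a" ('a' == 97) it yields (97<<4 + 97>>4) * 11 == 17138.
- */
-#include <assert.h>
-#include <stdint.h>
-
-static uint32_t r5(const signed char *msg)
-{
- uint32_t a = 0;
-
- while (*msg) {
- a += *msg << 4;
- a += *msg >> 4;
- a *= 11;
- msg++;
- }
- return a;
-}
-
-int main(void)
-{
- assert(r5((const signed char *)"a") == 17138);
- return 0;
-}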
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
deleted file mode 100644
index 5db6f45b3fed..000000000000
--- a/fs/reiserfs/ibalance.c
+++ /dev/null
@@ -1,1161 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/* this is the one and only function that is used outside (do_balance.c) */
-int balance_internal(struct tree_balance *,
- int, int, struct item_head *, struct buffer_head **);
-
-/*
- * modes of internal_shift_left, internal_shift_right and
- * internal_insert_childs
- */
-#define INTERNAL_SHIFT_FROM_S_TO_L 0
-#define INTERNAL_SHIFT_FROM_R_TO_S 1
-#define INTERNAL_SHIFT_FROM_L_TO_S 2
-#define INTERNAL_SHIFT_FROM_S_TO_R 3
-#define INTERNAL_INSERT_TO_S 4
-#define INTERNAL_INSERT_TO_L 5
-#define INTERNAL_INSERT_TO_R 6
-
-static void internal_define_dest_src_infos(int shift_mode,
- struct tree_balance *tb,
- int h,
- struct buffer_info *dest_bi,
- struct buffer_info *src_bi,
- int *d_key, struct buffer_head **cf)
-{
- memset(dest_bi, 0, sizeof(struct buffer_info));
- memset(src_bi, 0, sizeof(struct buffer_info));
- /* define dest, src, dest parent, dest position */
- switch (shift_mode) {
-
- /* used in internal_shift_left */
- case INTERNAL_SHIFT_FROM_S_TO_L:
- src_bi->tb = tb;
- src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
- src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->L[h];
- dest_bi->bi_parent = tb->FL[h];
- dest_bi->bi_position = get_left_neighbor_position(tb, h);
- *d_key = tb->lkey[h];
- *cf = tb->CFL[h];
- break;
- case INTERNAL_SHIFT_FROM_L_TO_S:
- src_bi->tb = tb;
- src_bi->bi_bh = tb->L[h];
- src_bi->bi_parent = tb->FL[h];
- src_bi->bi_position = get_left_neighbor_position(tb, h);
- dest_bi->tb = tb;
- dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
- dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- /* dest position is analog of dest->b_item_order */
- dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- *d_key = tb->lkey[h];
- *cf = tb->CFL[h];
- break;
-
- /* used in internal_shift_left */
- case INTERNAL_SHIFT_FROM_R_TO_S:
- src_bi->tb = tb;
- src_bi->bi_bh = tb->R[h];
- src_bi->bi_parent = tb->FR[h];
- src_bi->bi_position = get_right_neighbor_position(tb, h);
- dest_bi->tb = tb;
- dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
- dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- *d_key = tb->rkey[h];
- *cf = tb->CFR[h];
- break;
-
- case INTERNAL_SHIFT_FROM_S_TO_R:
- src_bi->tb = tb;
- src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
- src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->R[h];
- dest_bi->bi_parent = tb->FR[h];
- dest_bi->bi_position = get_right_neighbor_position(tb, h);
- *d_key = tb->rkey[h];
- *cf = tb->CFR[h];
- break;
-
- case INTERNAL_INSERT_TO_L:
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->L[h];
- dest_bi->bi_parent = tb->FL[h];
- dest_bi->bi_position = get_left_neighbor_position(tb, h);
- break;
-
- case INTERNAL_INSERT_TO_S:
- dest_bi->tb = tb;
- dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
- dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- break;
-
- case INTERNAL_INSERT_TO_R:
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->R[h];
- dest_bi->bi_parent = tb->FR[h];
- dest_bi->bi_position = get_right_neighbor_position(tb, h);
- break;
-
- default:
- reiserfs_panic(tb->tb_sb, "ibalance-1",
- "shift type is unknown (%d)",
- shift_mode);
- }
-}
-
-/*
- * Insert count node pointers into buffer cur before position to + 1.
- * Insert count items into buffer cur before position to.
- * Items and node pointers are specified by inserted and bh respectively.
- */
-static void internal_insert_childs(struct buffer_info *cur_bi,
- int to, int count,
- struct item_head *inserted,
- struct buffer_head **bh)
-{
- struct buffer_head *cur = cur_bi->bi_bh;
- struct block_head *blkh;
- int nr;
- struct reiserfs_key *ih;
- struct disk_child new_dc[2];
- struct disk_child *dc;
- int i;
-
- if (count <= 0)
- return;
-
- blkh = B_BLK_HEAD(cur);
- nr = blkh_nr_item(blkh);
-
- RFALSE(count > 2, "too many children (%d) are to be inserted", count);
- RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
- "no enough free space (%d), needed %d bytes",
- B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
-
- /* prepare space for count disk_child */
- dc = B_N_CHILD(cur, to + 1);
-
- memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
-
- /* copy to_be_insert disk children */
- for (i = 0; i < count; i++) {
- put_dc_size(&new_dc[i],
- MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
- put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
- }
- memcpy(dc, new_dc, DC_SIZE * count);
-
- /* prepare space for count items */
- ih = internal_key(cur, ((to == -1) ? 0 : to));
-
- memmove(ih + count, ih,
- (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
-
- /* copy item headers (keys) */
- memcpy(ih, inserted, KEY_SIZE);
- if (count > 1)
- memcpy(ih + 1, inserted + 1, KEY_SIZE);
-
- /* sizes, item number */
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
- set_blkh_free_space(blkh,
- blkh_free_space(blkh) - count * (DC_SIZE +
- KEY_SIZE));
-
- do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
-
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(cur);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
-
- if (cur_bi->bi_parent) {
- struct disk_child *t_dc =
- B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
- put_dc_size(t_dc,
- dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
- do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
- 0);
-
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(cur_bi->bi_parent);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- }
-
-}
-
-/*
- * Delete del_num items and node pointers from buffer cur starting from
- * the first_i'th item and first_p'th pointers respectively.
- */
-static void internal_delete_pointers_items(struct buffer_info *cur_bi,
- int first_p,
- int first_i, int del_num)
-{
- struct buffer_head *cur = cur_bi->bi_bh;
- int nr;
- struct block_head *blkh;
- struct reiserfs_key *key;
- struct disk_child *dc;
-
- RFALSE(cur == NULL, "buffer is 0");
- RFALSE(del_num < 0,
- "negative number of items (%d) can not be deleted", del_num);
- RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
- || first_i < 0,
- "first pointer order (%d) < 0 or "
- "no so many pointers (%d), only (%d) or "
- "first key order %d < 0", first_p, first_p + del_num,
- B_NR_ITEMS(cur) + 1, first_i);
- if (del_num == 0)
- return;
-
- blkh = B_BLK_HEAD(cur);
- nr = blkh_nr_item(blkh);
-
- if (first_p == 0 && del_num == nr + 1) {
- RFALSE(first_i != 0,
- "1st deleted key must have order 0, not %d", first_i);
- make_empty_node(cur_bi);
- return;
- }
-
- RFALSE(first_i + del_num > B_NR_ITEMS(cur),
- "first_i = %d del_num = %d "
- "no so many keys (%d) in the node (%b)(%z)",
- first_i, del_num, first_i + del_num, cur, cur);
-
- /* deleting */
- dc = B_N_CHILD(cur, first_p);
-
- memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
- key = internal_key(cur, first_i);
- memmove(key, key + del_num,
- (nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
- del_num) * DC_SIZE);
-
- /* sizes, item number */
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
- set_blkh_free_space(blkh,
- blkh_free_space(blkh) +
- (del_num * (KEY_SIZE + DC_SIZE)));
-
- do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
- /*&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(cur);
- /*&&&&&&&&&&&&&&&&&&&&&&& */
-
- if (cur_bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
- put_dc_size(t_dc,
- dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
-
- do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
- 0);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(cur_bi->bi_parent);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- }
-}
-
-/* delete n node pointers and items starting from given position */
-static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
-{
- int i_from;
-
- i_from = (from == 0) ? from : from - 1;
-
- /*
- * delete n pointers starting from `from' position in CUR;
- * delete n keys starting from 'i_from' position in CUR;
- */
- internal_delete_pointers_items(cur_bi, from, i_from, n);
-}
-
-/*
- * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
- * dest
- * last_first == FIRST_TO_LAST means that we copy first items
- * from src to tail of dest
- * last_first == LAST_TO_FIRST means that we copy last items
- * from src to head of dest
- */
-static void internal_copy_pointers_items(struct buffer_info *dest_bi,
- struct buffer_head *src,
- int last_first, int cpy_num)
-{
- /*
- * ATTENTION! The number of node pointers in DEST equals the number
- * of items in DEST, as the delimiting key has already been inserted
- * into buffer dest.
- */
- struct buffer_head *dest = dest_bi->bi_bh;
- int nr_dest, nr_src;
- int dest_order, src_order;
- struct block_head *blkh;
- struct reiserfs_key *key;
- struct disk_child *dc;
-
- nr_src = B_NR_ITEMS(src);
-
- RFALSE(dest == NULL || src == NULL,
- "src (%p) or dest (%p) buffer is 0", src, dest);
- RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
- "invalid last_first parameter (%d)", last_first);
- RFALSE(nr_src < cpy_num - 1,
- "no so many items (%d) in src (%d)", cpy_num, nr_src);
- RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
- RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
- "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
- cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
-
- if (cpy_num == 0)
- return;
-
- /* copying */
- blkh = B_BLK_HEAD(dest);
- nr_dest = blkh_nr_item(blkh);
-
- if (last_first == LAST_TO_FIRST) {
- dest_order = 0;
- src_order = nr_src - cpy_num + 1;
- } else {
- dest_order = nr_dest;
- src_order = 0;
- }
-
- /* prepare space for cpy_num pointers */
- dc = B_N_CHILD(dest, dest_order);
-
- memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
-
- /* insert pointers */
- memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
-
- /* prepare space for cpy_num - 1 item headers */
- key = internal_key(dest, dest_order);
- memmove(key + cpy_num - 1, key,
- KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
- cpy_num));
-
- /* insert headers */
- memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
-
- /* sizes, item number */
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
- set_blkh_free_space(blkh,
- blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
- DC_SIZE * cpy_num));
-
- do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(dest);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
-
- if (dest_bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
- put_dc_size(t_dc,
- dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
- DC_SIZE * cpy_num));
-
- do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
- 0);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- check_internal(dest_bi->bi_parent);
- /*&&&&&&&&&&&&&&&&&&&&&&&& */
- }
-
-}
-
-/*
- * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
- * buffer dest.
- * Delete cpy_num - del_par items and node pointers from buffer src.
- * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
- * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
- */
-static void internal_move_pointers_items(struct buffer_info *dest_bi,
- struct buffer_info *src_bi,
- int last_first, int cpy_num,
- int del_par)
-{
- int first_pointer;
- int first_item;
-
- internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
- cpy_num);
-
- if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
- first_pointer = 0;
- first_item = 0;
- /*
- * delete cpy_num - del_par pointers and keys starting for
- * pointers with first_pointer, for key - with first_item
- */
- internal_delete_pointers_items(src_bi, first_pointer,
- first_item, cpy_num - del_par);
- } else { /* shift_right occurs */
- int i, j;
-
- j = B_NR_ITEMS(src_bi->bi_bh);
- i = (cpy_num - del_par == j + 1) ? 0 : j - cpy_num + del_par;
-
- internal_delete_pointers_items(src_bi,
- j + 1 - cpy_num + del_par, i,
- cpy_num - del_par);
- }
-}
-
-/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key(struct buffer_info *dest_bi,
- /* insert key before key with n_dest number */
- int dest_position_before,
- struct buffer_head *src, int src_position)
-{
- struct buffer_head *dest = dest_bi->bi_bh;
- int nr;
- struct block_head *blkh;
- struct reiserfs_key *key;
-
- RFALSE(dest == NULL || src == NULL,
- "source(%p) or dest(%p) buffer is 0", src, dest);
- RFALSE(dest_position_before < 0 || src_position < 0,
- "source(%d) or dest(%d) key number less than 0",
- src_position, dest_position_before);
- RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
- src_position >= B_NR_ITEMS(src),
- "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
- dest_position_before, B_NR_ITEMS(dest),
- src_position, B_NR_ITEMS(src));
- RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
- "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
-
- blkh = B_BLK_HEAD(dest);
- nr = blkh_nr_item(blkh);
-
- /* prepare space for inserting key */
- key = internal_key(dest, dest_position_before);
- memmove(key + 1, key,
- (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
-
- /* insert key */
- memcpy(key, internal_key(src, src_position), KEY_SIZE);
-
- /* Change dirt, free space, item number fields. */
-
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
- set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
-
- do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
-
- if (dest_bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
- put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
-
- do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
- 0);
- }
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
- * Copy pointer_amount node pointers and pointer_amount - 1 items from
- * buffer src to buffer dest.
- * Replace d_key'th key in buffer cfl.
- * Delete pointer_amount items and node pointers from buffer src.
- */
-/* this can be invoked both to shift from S to L and from R to S */
-static void internal_shift_left(
- /*
- * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
- */
- int mode,
- struct tree_balance *tb,
- int h, int pointer_amount)
-{
- struct buffer_info dest_bi, src_bi;
- struct buffer_head *cf;
- int d_key_position;
-
- internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
- &d_key_position, &cf);
-
- /*printk("pointer_amount = %d\n",pointer_amount); */
-
- if (pointer_amount) {
- /*
- * insert delimiting key from common father of dest and
- * src to node dest into position B_NR_ITEM(dest)
- */
- internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
- d_key_position);
-
- if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
- if (src_bi.bi_position /* src->b_item_order */ == 0)
- replace_key(tb, cf, d_key_position,
- src_bi.bi_parent /* src->b_parent */, 0);
- } else
- replace_key(tb, cf, d_key_position, src_bi.bi_bh,
- pointer_amount - 1);
- }
- /* last parameter is del_parameter */
- internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
- pointer_amount, 0);
-
-}
-
-/*
- * Insert delimiting key to L[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to L[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shifts from S[h] to L[h] */
-static void internal_shift1_left(struct tree_balance *tb,
- int h, int pointer_amount)
-{
- struct buffer_info dest_bi, src_bi;
- struct buffer_head *cf;
- int d_key_position;
-
- internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
- &dest_bi, &src_bi, &d_key_position, &cf);
-
- /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
- if (pointer_amount > 0)
- internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
- d_key_position);
-
- /* last parameter is del_parameter */
- internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
- pointer_amount, 1);
-}
-
-/*
- * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
- * Copy n node pointers and n - 1 items from buffer src to buffer dest.
- * Replace d_key'th key in buffer cfr.
- * Delete n items and node pointers from buffer src.
- */
-static void internal_shift_right(
- /*
- * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
- */
- int mode,
- struct tree_balance *tb,
- int h, int pointer_amount)
-{
- struct buffer_info dest_bi, src_bi;
- struct buffer_head *cf;
- int d_key_position;
- int nr;
-
- internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
- &d_key_position, &cf);
-
- nr = B_NR_ITEMS(src_bi.bi_bh);
-
- if (pointer_amount > 0) {
- /*
- * insert delimiting key from common father of dest
- * and src to dest node into position 0
- */
- internal_insert_key(&dest_bi, 0, cf, d_key_position);
- if (nr == pointer_amount - 1) {
- RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
- dest_bi.bi_bh != tb->R[h],
- "src (%p) must be == tb->S[h](%p) when it disappears",
- src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
- /* when S[h] disappears, replace the left delimiting key as well */
- if (tb->CFL[h])
- replace_key(tb, cf, d_key_position, tb->CFL[h],
- tb->lkey[h]);
- } else
- replace_key(tb, cf, d_key_position, src_bi.bi_bh,
- nr - pointer_amount);
- }
-
- /* last parameter is del_parameter */
- internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
- pointer_amount, 0);
-}
-
-/*
- * Insert delimiting key to R[h].
- * Copy n node pointers and n - 1 items from buffer S[h] to R[h].
- * Delete n - 1 items and node pointers from buffer S[h].
- */
-/* it always shift from S[h] to R[h] */
-static void internal_shift1_right(struct tree_balance *tb,
- int h, int pointer_amount)
-{
- struct buffer_info dest_bi, src_bi;
- struct buffer_head *cf;
- int d_key_position;
-
- internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
- &dest_bi, &src_bi, &d_key_position, &cf);
-
- /* insert rkey from CFR[h] to right neighbor R[h] */
- if (pointer_amount > 0)
- internal_insert_key(&dest_bi, 0, cf, d_key_position);
-
- /* last parameter is del_parameter */
- internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
- pointer_amount, 1);
-}
-
-/*
- * Delete insert_num node pointers together with their left items
- * and balance current node.
- */
-static void balance_internal_when_delete(struct tree_balance *tb,
- int h, int child_pos)
-{
- int insert_num;
- int n;
- struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
- struct buffer_info bi;
-
- insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
-
- /* delete child-node-pointer(s) together with their left item(s) */
- bi.tb = tb;
- bi.bi_bh = tbSh;
- bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
- internal_delete_childs(&bi, child_pos, -insert_num);
-
- RFALSE(tb->blknum[h] > 1,
- "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
-
- n = B_NR_ITEMS(tbSh);
-
- if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
- if (tb->blknum[h] == 0) {
- /* node S[h] (root of the tree) is empty now */
- struct buffer_head *new_root;
-
- RFALSE(n
- || B_FREE_SPACE(tbSh) !=
- MAX_CHILD_SIZE(tbSh) - DC_SIZE,
- "buffer must have only 0 keys (%d)", n);
- RFALSE(bi.bi_parent, "root has parent (%p)",
- bi.bi_parent);
-
- /* choose a new root */
- if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
- new_root = tb->R[h - 1];
- else
- new_root = tb->L[h - 1];
- /*
- * switch super block's tree root block
- * number to the new value */
- PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
- /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
- PUT_SB_TREE_HEIGHT(tb->tb_sb,
- SB_TREE_HEIGHT(tb->tb_sb) - 1);
-
- do_balance_mark_sb_dirty(tb,
- REISERFS_SB(tb->tb_sb)->s_sbh,
- 1);
- /*&&&&&&&&&&&&&&&&&&&&&& */
- /* use check_internal if new root is an internal node */
- if (h > 1)
- check_internal(new_root);
- /*&&&&&&&&&&&&&&&&&&&&&& */
-
- /* do what is needed for buffer thrown from tree */
- reiserfs_invalidate_buffer(tb, tbSh);
- return;
- }
- return;
- }
-
- /* join S[h] with L[h] */
- if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
-
- RFALSE(tb->rnum[h] != 0,
- "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
- h, tb->rnum[h]);
-
- internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
- reiserfs_invalidate_buffer(tb, tbSh);
-
- return;
- }
-
- /* join S[h] with R[h] */
- if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
- RFALSE(tb->lnum[h] != 0,
- "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
- h, tb->lnum[h]);
-
- internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
-
- reiserfs_invalidate_buffer(tb, tbSh);
- return;
- }
-
- /* borrow from left neighbor L[h] */
- if (tb->lnum[h] < 0) {
- RFALSE(tb->rnum[h] != 0,
- "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
- tb->rnum[h]);
- internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
- -tb->lnum[h]);
- return;
- }
-
- /* borrow from right neighbor R[h] */
- if (tb->rnum[h] < 0) {
- RFALSE(tb->lnum[h] != 0,
- "invalid tb->lnum[%d]==%d when borrow from R[h]",
- h, tb->lnum[h]);
- internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
- return;
- }
-
- /* split S[h] into two parts and put them into neighbors */
- if (tb->lnum[h] > 0) {
- RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
- "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
- h, tb->lnum[h], h, tb->rnum[h], n);
-
- internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
- internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
- tb->rnum[h]);
-
- reiserfs_invalidate_buffer(tb, tbSh);
-
- return;
- }
- reiserfs_panic(tb->tb_sb, "ibalance-2",
- "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
- h, tb->lnum[h], h, tb->rnum[h]);
-}
-
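-/*
- * Editorial summary of the case analysis above as a tiny standalone
- * classifier: given lnum/rnum for level h (and the neighbors' item
- * counts n_l/n_r plus n items in S[h]), which action
- * balance_internal_when_delete() takes.  Purely illustrative.
- */
-#include <stdio.h>
-
-static const char *action(int lnum, int rnum, int n_l, int n_r, int n)
-{
- if (lnum == 0 && rnum == 0)
- return "nothing to shift (possibly drop an empty root)";
- if (lnum == -n_l - 1)
- return "join S[h] into L[h]";
- if (rnum == -n_r - 1)
- return "join S[h] into R[h]";
- if (lnum < 0)
- return "borrow -lnum items from L[h]";
- if (rnum < 0)
- return "borrow -rnum items from R[h]";
- if (lnum > 0 && lnum + rnum == n + 1)
- return "split S[h] between L[h] and R[h]";
- return "invalid combination";
-}
-
-int main(void)
-{
- puts(action(-3, 0, 5, 4, 7)); /* borrow 3 items from L[h] */
- return 0;
-}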
-/* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
-static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
-{
- RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
- "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
- tb->L[h], tb->CFL[h]);
-
- if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
- return;
-
- memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
-
- do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
-}
-
-/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
-static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
-{
- RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
- "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
- tb->R[h], tb->CFR[h]);
- RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
- "R[h] can not be empty if it exists (item number=%d)",
- B_NR_ITEMS(tb->R[h]));
-
- memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
-
- do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
-}
-
-
-/*
- * if inserting/pasting {
- * child_pos is the position of the node-pointer in S[h] that
- * pointed to S[h-1] before balancing of the h-1 level;
- * this means that new pointers and items must be inserted AFTER
- * child_pos
- * } else {
- * it is the position of the leftmost pointer that must be deleted
- * (together with its corresponding key to the left of the pointer)
- * as a result of the previous level's balancing.
- * }
- */
-
-int balance_internal(struct tree_balance *tb,
- int h, /* level of the tree */
- int child_pos,
- /* key for insertion on higher level */
- struct item_head *insert_key,
- /* node for insertion on higher level */
- struct buffer_head **insert_ptr)
-{
- struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
- struct buffer_info bi;
-
- /*
- * we return this: it is 0 if there is no S[h],
- * else it is tb->S[h]->b_item_order
- */
- int order;
- int insert_num, n, k;
- struct buffer_head *S_new;
- struct item_head new_insert_key;
- struct buffer_head *new_insert_ptr = NULL;
- struct item_head *new_insert_key_addr = insert_key;
-
- RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
-
- PROC_INFO_INC(tb->tb_sb, balance_at[h]);
-
- order = (tbSh) ?
- PATH_H_POSITION(tb->tb_path, h + 1) /* tb->S[h]->b_item_order */ : 0;
-
- /*
- * Using insert_size[h] calculate the number insert_num of items
- * that must be inserted to or deleted from S[h].
- */
- insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
-
- /* Check whether insert_num is proper */
- RFALSE(insert_num < -2 || insert_num > 2,
- "incorrect number of items inserted to the internal node (%d)",
- insert_num);
- RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
- "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
- insert_num, h);
-
- /* Make balance in case insert_num < 0 */
- if (insert_num < 0) {
- balance_internal_when_delete(tb, h, child_pos);
- return order;
- }
-
- k = 0;
- if (tb->lnum[h] > 0) {
- /*
- * shift lnum[h] items from S[h] to the left neighbor L[h].
- * check how many of new items fall into L[h] or CFL[h] after
- * shifting
- */
- n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */
- if (tb->lnum[h] <= child_pos) {
- /* new items don't fall into L[h] or CFL[h] */
- internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
- tb->lnum[h]);
- child_pos -= tb->lnum[h];
- } else if (tb->lnum[h] > child_pos + insert_num) {
- /* all new items fall into L[h] */
- internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
- tb->lnum[h] - insert_num);
- /* insert insert_num keys and node-pointers into L[h] */
- bi.tb = tb;
- bi.bi_bh = tb->L[h];
- bi.bi_parent = tb->FL[h];
- bi.bi_position = get_left_neighbor_position(tb, h);
- internal_insert_childs(&bi,
- /*tb->L[h], tb->S[h-1]->b_next */
- n + child_pos + 1,
- insert_num, insert_key,
- insert_ptr);
-
- insert_num = 0;
- } else {
- struct disk_child *dc;
-
- /*
- * some items fall into L[h] or CFL[h],
- * but some don't fall
- */
- internal_shift1_left(tb, h, child_pos + 1);
- /* calculate number of new items that fall into L[h] */
- k = tb->lnum[h] - child_pos - 1;
- bi.tb = tb;
- bi.bi_bh = tb->L[h];
- bi.bi_parent = tb->FL[h];
- bi.bi_position = get_left_neighbor_position(tb, h);
- internal_insert_childs(&bi,
- /*tb->L[h], tb->S[h-1]->b_next, */
- n + child_pos + 1, k,
- insert_key, insert_ptr);
-
- replace_lkey(tb, h, insert_key + k);
-
- /*
- * replace the first node-ptr in S[h] by
- * node-ptr to insert_ptr[k]
- */
- dc = B_N_CHILD(tbSh, 0);
- put_dc_size(dc,
- MAX_CHILD_SIZE(insert_ptr[k]) -
- B_FREE_SPACE(insert_ptr[k]));
- put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
-
- do_balance_mark_internal_dirty(tb, tbSh, 0);
-
- k++;
- insert_key += k;
- insert_ptr += k;
- insert_num -= k;
- child_pos = 0;
- }
- }
- /* tb->lnum[h] > 0 */
- if (tb->rnum[h] > 0) {
- /*shift rnum[h] items from S[h] to the right neighbor R[h] */
- /*
- * check how many of new items fall into R or CFR
- * after shifting
- */
- n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
- if (n - tb->rnum[h] >= child_pos)
- /* new items fall into S[h] */
- internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
- tb->rnum[h]);
- else if (n + insert_num - tb->rnum[h] < child_pos) {
- /* all new items fall into R[h] */
- internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
- tb->rnum[h] - insert_num);
-
- /* insert insert_num keys and node-pointers into R[h] */
- bi.tb = tb;
- bi.bi_bh = tb->R[h];
- bi.bi_parent = tb->FR[h];
- bi.bi_position = get_right_neighbor_position(tb, h);
- internal_insert_childs(&bi,
- /*tb->R[h],tb->S[h-1]->b_next */
- child_pos - n - insert_num +
- tb->rnum[h] - 1,
- insert_num, insert_key,
- insert_ptr);
- insert_num = 0;
- } else {
- struct disk_child *dc;
-
- /* one of the items falls into CFR[h] */
- internal_shift1_right(tb, h, n - child_pos + 1);
- /* calculate number of new items that fall into R[h] */
- k = tb->rnum[h] - n + child_pos - 1;
- bi.tb = tb;
- bi.bi_bh = tb->R[h];
- bi.bi_parent = tb->FR[h];
- bi.bi_position = get_right_neighbor_position(tb, h);
- internal_insert_childs(&bi,
- /*tb->R[h], tb->R[h]->b_child, */
- 0, k, insert_key + 1,
- insert_ptr + 1);
-
- replace_rkey(tb, h, insert_key + insert_num - k - 1);
-
- /*
- * replace the first node-ptr in R[h] by
- * node-ptr insert_ptr[insert_num-k-1]
- */
- dc = B_N_CHILD(tb->R[h], 0);
- put_dc_size(dc,
- MAX_CHILD_SIZE(insert_ptr
- [insert_num - k - 1]) -
- B_FREE_SPACE(insert_ptr
- [insert_num - k - 1]));
- put_dc_block_number(dc,
- insert_ptr[insert_num - k -
- 1]->b_blocknr);
-
- do_balance_mark_internal_dirty(tb, tb->R[h], 0);
-
- insert_num -= (k + 1);
- }
- }
-
-	/* Fill the new node that appears in place of S[h] */
-	RFALSE(tb->blknum[h] > 2, "blknum cannot be > 2 for internal level");
-	RFALSE(tb->blknum[h] < 0, "blknum cannot be < 0");
-
- if (!tb->blknum[h]) { /* node S[h] is empty now */
-		RFALSE(!tbSh, "S[h] is NULL");
-
- /* do what is needed for buffer thrown from tree */
- reiserfs_invalidate_buffer(tb, tbSh);
- return order;
- }
-
- if (!tbSh) {
- /* create new root */
- struct disk_child *dc;
- struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
- struct block_head *blkh;
-
- if (tb->blknum[h] != 1)
- reiserfs_panic(NULL, "ibalance-3", "One new node "
- "required for creating the new root");
- /* S[h] = empty buffer from the list FEB. */
- tbSh = get_FEB(tb);
- blkh = B_BLK_HEAD(tbSh);
- set_blkh_level(blkh, h + 1);
-
- /* Put the unique node-pointer to S[h] that points to S[h-1]. */
-
- dc = B_N_CHILD(tbSh, 0);
- put_dc_block_number(dc, tbSh_1->b_blocknr);
- put_dc_size(dc,
- (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
-
- tb->insert_size[h] -= DC_SIZE;
- set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
-
- do_balance_mark_internal_dirty(tb, tbSh, 0);
-
-		/* sanity check the new root */
-		check_internal(tbSh);
-
- /* put new root into path structure */
- PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
- tbSh;
-
- /* Change root in structure super block. */
- PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
- PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
- do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
- }
-
- if (tb->blknum[h] == 2) {
- int snum;
- struct buffer_info dest_bi, src_bi;
-
- /* S_new = free buffer from list FEB */
- S_new = get_FEB(tb);
-
- set_blkh_level(B_BLK_HEAD(S_new), h + 1);
-
- dest_bi.tb = tb;
- dest_bi.bi_bh = S_new;
- dest_bi.bi_parent = NULL;
- dest_bi.bi_position = 0;
- src_bi.tb = tb;
- src_bi.bi_bh = tbSh;
- src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
-
- n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
- snum = (insert_num + n + 1) / 2;
- if (n - snum >= child_pos) {
- /* new items don't fall into S_new */
- /* store the delimiting key for the next level */
- /* new_insert_key = (n - snum)'th key in S[h] */
- memcpy(&new_insert_key, internal_key(tbSh, n - snum),
- KEY_SIZE);
- /* last parameter is del_par */
- internal_move_pointers_items(&dest_bi, &src_bi,
- LAST_TO_FIRST, snum, 0);
- } else if (n + insert_num - snum < child_pos) {
- /* all new items fall into S_new */
- /* store the delimiting key for the next level */
- /*
-			 * new_insert_key = (n + insert_num - snum)'th
- * key in S[h]
- */
- memcpy(&new_insert_key,
- internal_key(tbSh, n + insert_num - snum),
- KEY_SIZE);
- /* last parameter is del_par */
- internal_move_pointers_items(&dest_bi, &src_bi,
- LAST_TO_FIRST,
- snum - insert_num, 0);
-
- /*
- * insert insert_num keys and node-pointers
- * into S_new
- */
- internal_insert_childs(&dest_bi,
- /*S_new,tb->S[h-1]->b_next, */
- child_pos - n - insert_num +
- snum - 1,
- insert_num, insert_key,
- insert_ptr);
-
- insert_num = 0;
- } else {
- struct disk_child *dc;
-
- /* some items fall into S_new, but some don't fall */
- /* last parameter is del_par */
- internal_move_pointers_items(&dest_bi, &src_bi,
- LAST_TO_FIRST,
- n - child_pos + 1, 1);
- /* calculate number of new items that fall into S_new */
- k = snum - n + child_pos - 1;
-
- internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
- insert_key + 1, insert_ptr + 1);
-
- /* new_insert_key = insert_key[insert_num - k - 1] */
- memcpy(&new_insert_key, insert_key + insert_num - k - 1,
- KEY_SIZE);
- /*
- * replace first node-ptr in S_new by node-ptr
- * to insert_ptr[insert_num-k-1]
- */
-
- dc = B_N_CHILD(S_new, 0);
- put_dc_size(dc,
- (MAX_CHILD_SIZE
- (insert_ptr[insert_num - k - 1]) -
- B_FREE_SPACE(insert_ptr
- [insert_num - k - 1])));
- put_dc_block_number(dc,
- insert_ptr[insert_num - k -
- 1]->b_blocknr);
-
- do_balance_mark_internal_dirty(tb, S_new, 0);
-
- insert_num -= (k + 1);
- }
- /* new_insert_ptr = node_pointer to S_new */
- new_insert_ptr = S_new;
-
- RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
- || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
- S_new);
-
- /* S_new is released in unfix_nodes */
- }
-
- n = B_NR_ITEMS(tbSh); /*number of items in S[h] */
-
- if (0 <= child_pos && child_pos <= n && insert_num > 0) {
- bi.tb = tb;
- bi.bi_bh = tbSh;
- bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
- internal_insert_childs(&bi, /*tbSh, */
- /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */
- child_pos, insert_num, insert_key,
- insert_ptr);
- }
-
- insert_ptr[0] = new_insert_ptr;
- if (new_insert_ptr)
- memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
-
- return order;
-}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
deleted file mode 100644
index d39ee5f6c075..000000000000
--- a/fs/reiserfs/inode.c
+++ /dev/null
@@ -1,3416 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/exportfs.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/unaligned.h>
-#include <linux/buffer_head.h>
-#include <linux/mpage.h>
-#include <linux/writeback.h>
-#include <linux/quotaops.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-
-int reiserfs_commit_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
-
-void reiserfs_evict_inode(struct inode *inode)
-{
- /*
- * We need blocks for transaction + (user+group) quota
- * update (possibly delete)
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 2 +
- 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
- struct reiserfs_transaction_handle th;
- int err;
-
- if (!inode->i_nlink && !is_bad_inode(inode))
- dquot_initialize(inode);
-
- truncate_inode_pages_final(&inode->i_data);
- if (inode->i_nlink)
- goto no_delete;
-
-	/*
-	 * k_objectid == 0 happens when we abort creating a new inode
-	 * for some reason, such as lack of space; this also handles
-	 * the bad_inode case
-	 */
- if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
-
- reiserfs_delete_xattrs(inode);
-
- reiserfs_write_lock(inode->i_sb);
-
- if (journal_begin(&th, inode->i_sb, jbegin_count))
- goto out;
- reiserfs_update_inode_transaction(inode);
-
- reiserfs_discard_prealloc(&th, inode);
-
- err = reiserfs_delete_object(&th, inode);
-
- /*
- * Do quota update inside a transaction for journaled quotas.
- * We must do that after delete_object so that quota updates
- * go into the same transaction as stat data deletion
- */
- if (!err) {
- int depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_free_inode(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- }
-
- if (journal_end(&th))
- goto out;
-
- /*
- * check return value from reiserfs_delete_object after
- * ending the transaction
- */
- if (err)
- goto out;
-
-		/*
-		 * all items of the file are deleted, so we can remove the
-		 * "save" link; we can't do anything about an error here
-		 */
- remove_save_link(inode, 0 /* not truncate */);
-out:
- reiserfs_write_unlock(inode->i_sb);
- } else {
- /* no object items are in the tree */
- ;
- }
-
- /* note this must go after the journal_end to prevent deadlock */
- clear_inode(inode);
-
- dquot_drop(inode);
- inode->i_blocks = 0;
- return;
-
-no_delete:
- clear_inode(inode);
- dquot_drop(inode);
-}
-
-static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
- __u32 objectid, loff_t offset, int type, int length)
-{
- key->version = version;
-
- key->on_disk_key.k_dir_id = dirid;
- key->on_disk_key.k_objectid = objectid;
- set_cpu_key_k_offset(key, offset);
- set_cpu_key_k_type(key, type);
- key->key_length = length;
-}
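-
-/*
- * An illustrative use, mirroring the lookup in _get_block_create_0
- * below: offsets in keys are 1-based, so the key for the first byte
- * of the 'block'-th logical block of a file is built as
- *
- *	make_cpu_key(&key, inode,
- *		     (loff_t)block * inode->i_sb->s_blocksize + 1,
- *		     TYPE_ANY, 3);
- *
- * where 3 is the key length used for fully specified keys.
- */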
-
-/*
- * take the base of the inode key (dirid, objectid; it always comes
- * from the inode) and the version from the inode, then set the offset
- * and type of the key
- */
-void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
- int type, int length)
-{
- _make_cpu_key(key, get_inode_item_key_version(inode),
- le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
- le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
- length);
-}
-
-/* when key is NULL, do not set the short key (dir_id and objectid) */
-inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
- int version,
- loff_t offset, int type, int length,
- int entry_count /*or ih_free_space */ )
-{
- if (key) {
- ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
- ih->ih_key.k_objectid =
- cpu_to_le32(key->on_disk_key.k_objectid);
- }
- put_ih_version(ih, version);
- set_le_ih_k_offset(ih, offset);
- set_le_ih_k_type(ih, type);
- put_ih_item_len(ih, length);
- /* set_ih_free_space (ih, 0); */
- /*
- * for directory items it is entry count, for directs and stat
- * datas - 0xffff, for indirects - 0
- */
- put_ih_entry_count(ih, entry_count);
-}
-
-/*
- * FIXME: we might cache recently accessed indirect item
- * Ugh. Not too eager for that....
- * I cut the code until such time as I see a convincing argument (benchmark).
- * I don't want a bloated inode struct..., and I don't like code complexity....
- */
-
-/*
- * cutting the code is fine, since it really isn't in use yet and is easy
- * to add back in. But, Vladimir has a really good idea here. Think
- * about what happens for reading a file. For each page,
- * The VFS layer calls reiserfs_read_folio, which searches the tree to find
- * an indirect item. This indirect item has X number of pointers, where
- * X is a big number if we've done the block allocation right. But,
- * we only use one or two of these pointers during each call to read_folio,
- * needlessly re-searching later on.
- *
- * The size of the cache could be dynamic based on the size of the file.
- *
- * I'd also like to see us cache the location of the stat data item,
- * since we are needlessly re-searching for it frequently.
- *
- * --chris
- */
-
-/*
- * If this page has a file tail in it, and
- * it was read in by get_block_create_0, the page data is valid,
- * but tail is still sitting in a direct item, and we can't write to
- * it. So, look through this page, and check all the mapped buffers
- * to make sure they have valid block numbers. Any that don't must
- * be unmapped, so that __block_write_begin will correctly call
- * reiserfs_get_block to convert the tail into an unformatted node
- */
-static inline void fix_tail_page_for_writing(struct page *page)
-{
- struct buffer_head *head, *next, *bh;
-
- if (page && page_has_buffers(page)) {
- head = page_buffers(page);
- bh = head;
- do {
- next = bh->b_this_page;
- if (buffer_mapped(bh) && bh->b_blocknr == 0) {
- reiserfs_unmap_buffer(bh);
- }
- bh = next;
- } while (bh != head);
- }
-}
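-
-/*
- * The buffers unmapped above are the ones _get_block_create_0 mapped
- * to block #0 after copying tail data into the page; block #0 is the
- * marker that a buffer holds tail data rather than a mappable disk
- * block (see the end of _get_block_create_0 below).
- */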
-
-/*
- * reiserfs_get_block needs to allocate a block unless one has been
- * allocated already or a non-hole position has been found in the
- * indirect item
- */
-static inline int allocation_needed(int retval, b_blocknr_t allocated,
- struct item_head *ih,
- __le32 * item, int pos_in_item)
-{
- if (allocated)
- return 0;
- if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
- get_block_num(item, pos_in_item))
- return 0;
- return 1;
-}
-
-static inline int indirect_item_found(int retval, struct item_head *ih)
-{
- return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
-}
-
-static inline void set_block_dev_mapped(struct buffer_head *bh,
- b_blocknr_t block, struct inode *inode)
-{
- map_bh(bh, inode->i_sb, block);
-}
-
-/*
- * files created by earlier (3.5 key format) versions cannot be larger
- * than 2 GB
- */
-static int file_capable(struct inode *inode, sector_t block)
-{
- /* it is new file. */
- if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
- /* old file, but 'block' is inside of 2gb */
- block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
- return 1;
-
- return 0;
-}
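-
-/*
- * For example, with a 4 KiB block size (s_blocksize_bits == 12) the
- * check above allows old-format files only block numbers below
- * 1 << (31 - 12) == 524288, i.e. below 2 GiB of file data.
- */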
-
-static int restart_transaction(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct treepath *path)
-{
- struct super_block *s = th->t_super;
- int err;
-
- BUG_ON(!th->t_trans_id);
- BUG_ON(!th->t_refcount);
-
- pathrelse(path);
-
- /* we cannot restart while nested */
- if (th->t_refcount > 1) {
- return 0;
- }
- reiserfs_update_sd(th, inode);
- err = journal_end(th);
- if (!err) {
- err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
- if (!err)
- reiserfs_update_inode_transaction(inode);
- }
- return err;
-}
-
-/*
- * Called by get_block when create == 0. Returns the block number for
- * the 'block'-th logical block of the file. When it hits a direct
- * item it either returns 0 (when called from bmap) or reads the
- * direct item into a piece of the page (bh_result).
- */
-static int _get_block_create_0(struct inode *inode, sector_t block,
- struct buffer_head *bh_result, int args)
-{
- INITIALIZE_PATH(path);
- struct cpu_key key;
- struct buffer_head *bh;
- struct item_head *ih, tmp_ih;
- b_blocknr_t blocknr;
- char *p;
- int chars;
- int ret;
- int result;
- int done = 0;
- unsigned long offset;
-
- /* prepare the key to look for the 'block'-th block of file */
- make_cpu_key(&key, inode,
- (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
- 3);
-
- result = search_for_position_by_key(inode->i_sb, &key, &path);
- if (result != POSITION_FOUND) {
- pathrelse(&path);
- if (result == IO_ERROR)
- return -EIO;
- /*
- * We do not return -ENOENT if there is a hole but page is
- * uptodate, because it means that there is some MMAPED data
- * associated with it that is yet to be written to disk.
- */
- if ((args & GET_BLOCK_NO_HOLE)
- && !PageUptodate(bh_result->b_page)) {
- return -ENOENT;
- }
- return 0;
- }
-
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- if (is_indirect_le_ih(ih)) {
- __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
-
- /*
- * FIXME: here we could cache indirect item or part of it in
- * the inode to avoid search_by_key in case of subsequent
- * access to file
- */
- blocknr = get_block_num(ind_item, path.pos_in_item);
- ret = 0;
- if (blocknr) {
- map_bh(bh_result, inode->i_sb, blocknr);
- if (path.pos_in_item ==
- ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
- set_buffer_boundary(bh_result);
- }
- } else
- /*
- * We do not return -ENOENT if there is a hole but
- * page is uptodate, because it means that there is
- * some MMAPED data associated with it that is
- * yet to be written to disk.
- */
- if ((args & GET_BLOCK_NO_HOLE)
- && !PageUptodate(bh_result->b_page)) {
- ret = -ENOENT;
- }
-
- pathrelse(&path);
- return ret;
- }
- /* requested data are in direct item(s) */
- if (!(args & GET_BLOCK_READ_DIRECT)) {
- /*
-		 * we are called by bmap. FIXME: we cannot map a block of the
-		 * file when it is stored in direct item(s)
- */
- pathrelse(&path);
- return -ENOENT;
- }
-
- /*
- * if we've got a direct item, and the buffer or page was uptodate,
- * we don't want to pull data off disk again. skip to the
- * end, where we map the buffer and return
- */
- if (buffer_uptodate(bh_result)) {
- goto finished;
- } else
- /*
- * grab_tail_page can trigger calls to reiserfs_get_block on
- * up to date pages without any buffers. If the page is up
-		 * to date, we don't want to read old data off disk. Set the up
- * to date bit on the buffer instead and jump to the end
- */
- if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
- set_buffer_uptodate(bh_result);
- goto finished;
- }
- /* read file tail into part of page */
- offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
- copy_item_head(&tmp_ih, ih);
-
- /*
- * we only want to kmap if we are reading the tail into the page.
- * this is not the common case, so we don't kmap until we are
- * sure we need to. But, this means the item might move if
- * kmap schedules
- */
- p = (char *)kmap(bh_result->b_page);
- p += offset;
- memset(p, 0, inode->i_sb->s_blocksize);
- do {
- if (!is_direct_le_ih(ih)) {
- BUG();
- }
- /*
- * make sure we don't read more bytes than actually exist in
- * the file. This can happen in odd cases where i_size isn't
- * correct, and when direct item padding results in a few
- * extra bytes at the end of the direct item
- */
- if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
- break;
- if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
- chars =
- inode->i_size - (le_ih_k_offset(ih) - 1) -
- path.pos_in_item;
- done = 1;
- } else {
- chars = ih_item_len(ih) - path.pos_in_item;
- }
- memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
-
- if (done)
- break;
-
- p += chars;
-
-		/*
-		 * we are done if the direct item we read is not the last item
-		 * of the node. FIXME: we could try to check the right
-		 * delimiting key to see whether the direct item continues in
-		 * the right neighbor, or rely on i_size
-		 */
- if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
- break;
-
- /* update key to look for the next piece */
- set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
- result = search_for_position_by_key(inode->i_sb, &key, &path);
- if (result != POSITION_FOUND)
- /* i/o error most likely */
- break;
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- } while (1);
-
- flush_dcache_page(bh_result->b_page);
- kunmap(bh_result->b_page);
-
-finished:
- pathrelse(&path);
-
- if (result == IO_ERROR)
- return -EIO;
-
- /*
- * this buffer has valid data, but isn't valid for io. mapping it to
- * block #0 tells the rest of reiserfs it just has a tail in it
- */
- map_bh(bh_result, inode->i_sb, 0);
- set_buffer_uptodate(bh_result);
- return 0;
-}
-
-/*
- * this is called to create the file map, so _get_block_create_0 will
- * not read the direct item
- */
-static int reiserfs_bmap(struct inode *inode, sector_t block,
- struct buffer_head *bh_result, int create)
-{
- if (!file_capable(inode, block))
- return -EFBIG;
-
- reiserfs_write_lock(inode->i_sb);
- /* do not read the direct item */
- _get_block_create_0(inode, block, bh_result, 0);
- reiserfs_write_unlock(inode->i_sb);
- return 0;
-}
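-
-/*
- * When the requested block is tail-packed in a direct item,
- * _get_block_create_0 returns -ENOENT from its GET_BLOCK_READ_DIRECT
- * check without mapping bh_result, so bmap reports no block for
- * tail-packed data.
- */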
-
-/*
- * special version of get_block that is only used by grab_tail_page right
- * now. It is sent to __block_write_begin, and when you try to get a
- * block past the end of the file (or a block from a hole) it returns
- * -ENOENT instead of a valid buffer. __block_write_begin expects to
- * be able to do i/o on the buffers returned, unless an error value
- * is also returned.
- *
- * So, this allows __block_write_begin to be used for reading a single block
- * in a page, where it does not produce a valid page for holes or past the
- * end of the file. This turns out to be exactly what we need for reading
- * tails for conversion.
- *
- * The point of the wrapper is forcing a certain value for create, even
- * though the VFS layer is calling this function with create==1. If you
- * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
- * don't use this function.
- */
-static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
- struct buffer_head *bh_result,
- int create)
-{
- return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
-}
-
-/*
- * This is a special helper for reiserfs_get_block for when we are
- * executing a direct I/O request.
- */
-static int reiserfs_get_blocks_direct_io(struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- int ret;
-
- bh_result->b_page = NULL;
-
-	/*
-	 * We set b_size before the reiserfs_get_block call since it is
-	 * referenced in convert_tail_for_hole(), which may be called
-	 * from reiserfs_get_block()
-	 */
- bh_result->b_size = i_blocksize(inode);
-
- ret = reiserfs_get_block(inode, iblock, bh_result,
- create | GET_BLOCK_NO_DANGLE);
- if (ret)
- goto out;
-
- /* don't allow direct io onto tail pages */
- if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /*
- * make sure future calls to the direct io funcs for this
- * offset in the file fail by unmapping the buffer
- */
- clear_buffer_mapped(bh_result);
- ret = -EINVAL;
- }
-
- /*
- * Possible unpacked tail. Flush the data before pages have
- * disappeared
- */
- if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
- int err;
-
- reiserfs_write_lock(inode->i_sb);
-
- err = reiserfs_commit_for_inode(inode);
- REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
- reiserfs_write_unlock(inode->i_sb);
-
- if (err < 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-/*
- * helper function for when reiserfs_get_block is called for a hole
- * but the file tail is still in a direct item
- * bh_result is the buffer head for the hole
- * tail_offset is the offset of the start of the tail in the file
- *
- * This calls prepare_write, which will start a new transaction. You
- * should not be in a transaction, or have any paths held, when you
- * call this.
- */
-static int convert_tail_for_hole(struct inode *inode,
- struct buffer_head *bh_result,
- loff_t tail_offset)
-{
- unsigned long index;
- unsigned long tail_end;
- unsigned long tail_start;
- struct page *tail_page;
- struct page *hole_page = bh_result->b_page;
- int retval = 0;
-
- if ((tail_offset & (bh_result->b_size - 1)) != 1)
- return -EIO;
-
- /* always try to read until the end of the block */
- tail_start = tail_offset & (PAGE_SIZE - 1);
- tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
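-
-	/*
-	 * For example, with 4 KiB blocks and pages, a tail whose (1-based)
-	 * key offset is 8193 gives tail_start == 1 and tail_end == 4096
-	 * (the offsets keep the key's 1-based convention).
-	 */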
-
- index = tail_offset >> PAGE_SHIFT;
-	/*
-	 * hole_page can be NULL in the direct_io case; we are sure
-	 * that we cannot get here if we write with O_DIRECT into the
-	 * tail page
-	 */
- if (!hole_page || index != hole_page->index) {
- tail_page = grab_cache_page(inode->i_mapping, index);
- retval = -ENOMEM;
- if (!tail_page) {
- goto out;
- }
- } else {
- tail_page = hole_page;
- }
-
- /*
- * we don't have to make sure the conversion did not happen while
- * we were locking the page because anyone that could convert
- * must first take i_mutex.
- *
- * We must fix the tail page for writing because it might have buffers
- * that are mapped, but have a block number of 0. This indicates tail
- * data that has been read directly into the page, and
- * __block_write_begin won't trigger a get_block in this case.
- */
- fix_tail_page_for_writing(tail_page);
- retval = __reiserfs_write_begin(tail_page, tail_start,
- tail_end - tail_start);
- if (retval)
- goto unlock;
-
- /* tail conversion might change the data in the page */
- flush_dcache_page(tail_page);
-
- retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
-
-unlock:
- if (tail_page != hole_page) {
- unlock_page(tail_page);
- put_page(tail_page);
- }
-out:
- return retval;
-}
-
-static inline int _allocate_block(struct reiserfs_transaction_handle *th,
- sector_t block,
- struct inode *inode,
- b_blocknr_t * allocated_block_nr,
- struct treepath *path, int flags)
-{
- BUG_ON(!th->t_trans_id);
-
-#ifdef REISERFS_PREALLOCATE
- if (!(flags & GET_BLOCK_NO_IMUX)) {
- return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
- path, block);
- }
-#endif
- return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
- block);
-}
-
-int reiserfs_get_block(struct inode *inode, sector_t block,
- struct buffer_head *bh_result, int create)
-{
- int repeat, retval = 0;
-	/* b_blocknr_t is an (unsigned) 32-bit int */
- b_blocknr_t allocated_block_nr = 0;
- INITIALIZE_PATH(path);
- int pos_in_item;
- struct cpu_key key;
- struct buffer_head *bh, *unbh = NULL;
- struct item_head *ih, tmp_ih;
- __le32 *item;
- int done;
- int fs_gen;
- struct reiserfs_transaction_handle *th = NULL;
- /*
- * space reserved in transaction batch:
- * . 3 balancings in direct->indirect conversion
- * . 1 block involved into reiserfs_update_sd()
- * XXX in practically impossible worst case direct2indirect()
- * can incur (much) more than 3 balancings.
- * quota update for user, group
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 + 1 +
- 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
- int version;
- int dangle = 1;
- loff_t new_offset =
- (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
-
- reiserfs_write_lock(inode->i_sb);
- version = get_inode_item_key_version(inode);
-
- if (!file_capable(inode, block)) {
- reiserfs_write_unlock(inode->i_sb);
- return -EFBIG;
- }
-
- /*
- * if !create, we aren't changing the FS, so we don't need to
- * log anything, so we don't need to start a transaction
- */
- if (!(create & GET_BLOCK_CREATE)) {
- int ret;
- /* find number of block-th logical block of the file */
- ret = _get_block_create_0(inode, block, bh_result,
- create | GET_BLOCK_READ_DIRECT);
- reiserfs_write_unlock(inode->i_sb);
- return ret;
- }
-
- /*
- * if we're already in a transaction, make sure to close
- * any new transactions we start in this func
- */
- if ((create & GET_BLOCK_NO_DANGLE) ||
- reiserfs_transaction_running(inode->i_sb))
- dangle = 0;
-
-	/*
-	 * If the file is of such a size that it might have a tail, and
-	 * tails are enabled, we should mark it as possibly needing
-	 * tail packing on close
-	 */
- if ((have_large_tails(inode->i_sb)
- && inode->i_size < i_block_size(inode) * 4)
- || (have_small_tails(inode->i_sb)
- && inode->i_size < i_block_size(inode)))
- REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
-
- /* set the key of the first byte in the 'block'-th block of file */
- make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
- if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-start_trans:
- th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
- if (!th) {
- retval = -ENOMEM;
- goto failure;
- }
- reiserfs_update_inode_transaction(inode);
- }
-research:
-
- retval = search_for_position_by_key(inode->i_sb, &key, &path);
- if (retval == IO_ERROR) {
- retval = -EIO;
- goto failure;
- }
-
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- item = tp_item_body(&path);
- pos_in_item = path.pos_in_item;
-
- fs_gen = get_generation(inode->i_sb);
- copy_item_head(&tmp_ih, ih);
-
- if (allocation_needed
- (retval, allocated_block_nr, ih, item, pos_in_item)) {
- /* we have to allocate block for the unformatted node */
- if (!th) {
- pathrelse(&path);
- goto start_trans;
- }
-
- repeat =
- _allocate_block(th, block, inode, &allocated_block_nr,
- &path, create);
-
-		/*
-		 * restart the transaction to give the journal a chance to free
-		 * some blocks. This releases the path, so we have to go back
-		 * to research if we succeed on the second try
-		 */
- if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
- SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
- retval = restart_transaction(th, inode, &path);
- if (retval)
- goto failure;
- repeat =
- _allocate_block(th, block, inode,
- &allocated_block_nr, NULL, create);
-
- if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
- goto research;
- }
- if (repeat == QUOTA_EXCEEDED)
- retval = -EDQUOT;
- else
- retval = -ENOSPC;
- goto failure;
- }
-
- if (fs_changed(fs_gen, inode->i_sb)
- && item_moved(&tmp_ih, &path)) {
- goto research;
- }
- }
-
- if (indirect_item_found(retval, ih)) {
- b_blocknr_t unfm_ptr;
- /*
- * 'block'-th block is in the file already (there is
- * corresponding cell in some indirect item). But it may be
- * zero unformatted node pointer (hole)
- */
- unfm_ptr = get_block_num(item, pos_in_item);
- if (unfm_ptr == 0) {
- /* use allocated block to plug the hole */
- reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
- if (fs_changed(fs_gen, inode->i_sb)
- && item_moved(&tmp_ih, &path)) {
- reiserfs_restore_prepared_buffer(inode->i_sb,
- bh);
- goto research;
- }
- set_buffer_new(bh_result);
- if (buffer_dirty(bh_result)
- && reiserfs_data_ordered(inode->i_sb))
- reiserfs_add_ordered_list(inode, bh_result);
- put_block_num(item, pos_in_item, allocated_block_nr);
- unfm_ptr = allocated_block_nr;
- journal_mark_dirty(th, bh);
- reiserfs_update_sd(th, inode);
- }
- set_block_dev_mapped(bh_result, unfm_ptr, inode);
- pathrelse(&path);
- retval = 0;
- if (!dangle && th)
- retval = reiserfs_end_persistent_transaction(th);
-
- reiserfs_write_unlock(inode->i_sb);
-
- /*
- * the item was found, so new blocks were not added to the file
- * there is no need to make sure the inode is updated with this
- * transaction
- */
- return retval;
- }
-
- if (!th) {
- pathrelse(&path);
- goto start_trans;
- }
-
-	/*
-	 * desired position is not found or is in a direct item. We have
-	 * to append the file with holes up to the 'block'-th block,
-	 * converting direct items to an indirect one if necessary
-	 */
- done = 0;
- do {
- if (is_statdata_le_ih(ih)) {
- __le32 unp = 0;
- struct cpu_key tmp_key;
-
- /* indirect item has to be inserted */
- make_le_item_head(&tmp_ih, &key, version, 1,
- TYPE_INDIRECT, UNFM_P_SIZE,
- 0 /* free_space */ );
-
- /*
- * we are going to add 'block'-th block to the file.
- * Use allocated block for that
- */
- if (cpu_key_k_offset(&key) == 1) {
- unp = cpu_to_le32(allocated_block_nr);
- set_block_dev_mapped(bh_result,
- allocated_block_nr, inode);
- set_buffer_new(bh_result);
- done = 1;
- }
- tmp_key = key; /* ;) */
- set_cpu_key_k_offset(&tmp_key, 1);
- PATH_LAST_POSITION(&path)++;
-
- retval =
- reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
- inode, (char *)&unp);
- if (retval) {
- reiserfs_free_block(th, inode,
- allocated_block_nr, 1);
- /*
- * retval == -ENOSPC, -EDQUOT or -EIO
- * or -EEXIST
- */
- goto failure;
- }
- } else if (is_direct_le_ih(ih)) {
- /* direct item has to be converted */
- loff_t tail_offset;
-
- tail_offset =
- ((le_ih_k_offset(ih) -
- 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
-
- /*
- * direct item we just found fits into block we have
- * to map. Convert it into unformatted node: use
- * bh_result for the conversion
- */
- if (tail_offset == cpu_key_k_offset(&key)) {
- set_block_dev_mapped(bh_result,
- allocated_block_nr, inode);
- unbh = bh_result;
- done = 1;
- } else {
- /*
- * we have to pad file tail stored in direct
- * item(s) up to block size and convert it
- * to unformatted node. FIXME: this should
- * also get into page cache
- */
-
- pathrelse(&path);
- /*
- * ugly, but we can only end the transaction if
- * we aren't nested
- */
- BUG_ON(!th->t_refcount);
- if (th->t_refcount == 1) {
- retval =
- reiserfs_end_persistent_transaction
- (th);
- th = NULL;
- if (retval)
- goto failure;
- }
-
- retval =
- convert_tail_for_hole(inode, bh_result,
- tail_offset);
- if (retval) {
- if (retval != -ENOSPC)
- reiserfs_error(inode->i_sb,
- "clm-6004",
- "convert tail failed "
- "inode %lu, error %d",
- inode->i_ino,
- retval);
- if (allocated_block_nr) {
- /*
- * the bitmap, the super,
- * and the stat data == 3
- */
- if (!th)
- th = reiserfs_persistent_transaction(inode->i_sb, 3);
- if (th)
- reiserfs_free_block(th,
- inode,
- allocated_block_nr,
- 1);
- }
- goto failure;
- }
- goto research;
- }
- retval =
- direct2indirect(th, inode, &path, unbh,
- tail_offset);
- if (retval) {
- reiserfs_unmap_buffer(unbh);
- reiserfs_free_block(th, inode,
- allocated_block_nr, 1);
- goto failure;
- }
- /*
-			 * it is important that set_buffer_uptodate is done
- * after the direct2indirect. The buffer might
- * contain valid data newer than the data on disk
- * (read by read_folio, changed, and then sent here by
- * writepage). direct2indirect needs to know if unbh
- * was already up to date, so it can decide if the
- * data in unbh needs to be replaced with data from
- * the disk
- */
- set_buffer_uptodate(unbh);
-
-			/*
-			 * unbh->b_page == NULL in case of a DIRECT_IO request;
-			 * this means the buffer will disappear shortly, so it
-			 * should not be added to the tail list
-			 */
- if (unbh->b_page) {
- /*
- * we've converted the tail, so we must
- * flush unbh before the transaction commits
- */
- reiserfs_add_tail_list(inode, unbh);
-
- /*
- * mark it dirty now to prevent commit_write
- * from adding this buffer to the inode's
- * dirty buffer list
- */
- /*
- * AKPM: changed __mark_buffer_dirty to
- * mark_buffer_dirty(). It's still atomic,
- * but it sets the page dirty too, which makes
- * it eligible for writeback at any time by the
- * VM (which was also the case with
- * __mark_buffer_dirty())
- */
- mark_buffer_dirty(unbh);
- }
- } else {
-			/*
-			 * the indirect item has to be appended with holes if
-			 * needed; when appending the pointer to the 'block'-th
-			 * block, use the block that is already allocated
-			 */
- struct cpu_key tmp_key;
- /*
- * We use this in case we need to allocate
- * only one block which is a fastpath
- */
- unp_t unf_single = 0;
- unp_t *un;
- __u64 max_to_insert =
- MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
- UNFM_P_SIZE;
- __u64 blocks_needed;
-
- RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
- "vs-804: invalid position for append");
- /*
- * indirect item has to be appended,
- * set up key of that position
- * (key type is unimportant)
- */
- make_cpu_key(&tmp_key, inode,
- le_key_k_offset(version,
- &ih->ih_key) +
- op_bytes_number(ih,
- inode->i_sb->s_blocksize),
- TYPE_INDIRECT, 3);
-
- RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
- "green-805: invalid offset");
- blocks_needed =
- 1 +
- ((cpu_key_k_offset(&key) -
- cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
- s_blocksize_bits);
-
- if (blocks_needed == 1) {
- un = &unf_single;
- } else {
- un = kcalloc(min(blocks_needed, max_to_insert),
- UNFM_P_SIZE, GFP_NOFS);
- if (!un) {
- un = &unf_single;
- blocks_needed = 1;
- max_to_insert = 0;
- }
- }
- if (blocks_needed <= max_to_insert) {
- /*
- * we are going to add target block to
- * the file. Use allocated block for that
- */
- un[blocks_needed - 1] =
- cpu_to_le32(allocated_block_nr);
- set_block_dev_mapped(bh_result,
- allocated_block_nr, inode);
- set_buffer_new(bh_result);
- done = 1;
- } else {
- /* paste hole to the indirect item */
- /*
- * If kcalloc failed, max_to_insert becomes
- * zero and it means we only have space for
- * one block
- */
- blocks_needed =
- max_to_insert ? max_to_insert : 1;
- }
- retval =
- reiserfs_paste_into_item(th, &path, &tmp_key, inode,
- (char *)un,
- UNFM_P_SIZE *
- blocks_needed);
-
- if (blocks_needed != 1)
- kfree(un);
-
- if (retval) {
- reiserfs_free_block(th, inode,
- allocated_block_nr, 1);
- goto failure;
- }
- if (!done) {
-				/*
-				 * We need to record the new file size in
-				 * case this function is interrupted/aborted
-				 * later on, and we may do this only for
-				 * holes.
-				 */
- inode->i_size +=
- inode->i_sb->s_blocksize * blocks_needed;
- }
- }
-
- if (done == 1)
- break;
-
-		/*
-		 * this loop could log more blocks than we had originally
-		 * asked for. So, we have to allow the transaction to end
-		 * if it is too big or too full. Update the inode so things
-		 * are consistent if we crash before the function returns,
-		 * and release the path so that anybody waiting on the path
-		 * before ending their transaction will be able to continue.
-		 */
- if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
- retval = restart_transaction(th, inode, &path);
- if (retval)
- goto failure;
- }
- /*
- * inserting indirect pointers for a hole can take a
- * long time. reschedule if needed and also release the write
- * lock for others.
- */
- reiserfs_cond_resched(inode->i_sb);
-
- retval = search_for_position_by_key(inode->i_sb, &key, &path);
- if (retval == IO_ERROR) {
- retval = -EIO;
- goto failure;
- }
- if (retval == POSITION_FOUND) {
- reiserfs_warning(inode->i_sb, "vs-825",
- "%K should not be found", &key);
- retval = -EEXIST;
- if (allocated_block_nr)
- reiserfs_free_block(th, inode,
- allocated_block_nr, 1);
- pathrelse(&path);
- goto failure;
- }
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- item = tp_item_body(&path);
- pos_in_item = path.pos_in_item;
- } while (1);
-
- retval = 0;
-
-failure:
- if (th && (!dangle || (retval && !th->t_trans_id))) {
- int err;
- if (th->t_trans_id)
- reiserfs_update_sd(th, inode);
- err = reiserfs_end_persistent_transaction(th);
- if (err)
- retval = err;
- }
-
- reiserfs_write_unlock(inode->i_sb);
- reiserfs_check_path(&path);
- return retval;
-}
-
-static void reiserfs_readahead(struct readahead_control *rac)
-{
- mpage_readahead(rac, reiserfs_get_block);
-}
-
-/*
- * Compute the real number of bytes used by a file.
- * The following three functions can go away when we have enough space
- * in the stat item
- */
-static int real_space_diff(struct inode *inode, int sd_size)
-{
- int bytes;
- loff_t blocksize = inode->i_sb->s_blocksize;
-
- if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
- return sd_size;
-
- /*
- * End of file is also in full block with indirect reference, so round
- * up to the next block.
- *
- * there is just no way to know if the tail is actually packed
- * on the file, so we have to assume it isn't. When we pack the
- * tail, we add 4 bytes to pretend there really is an unformatted
- * node pointer
- */
- bytes =
- ((inode->i_size +
- (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
- sd_size;
- return bytes;
-}
-
-static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
- int sd_size)
-{
- if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
- return inode->i_size +
- (loff_t) (real_space_diff(inode, sd_size));
- }
- return ((loff_t) real_space_diff(inode, sd_size)) +
- (((loff_t) blocks) << 9);
-}
-
-/* Compute number of blocks used by file in ReiserFS counting */
-static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
-{
- loff_t bytes = inode_get_bytes(inode);
- loff_t real_space = real_space_diff(inode, sd_size);
-
- /* keeps fsck and non-quota versions of reiserfs happy */
- if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
- bytes += (loff_t) 511;
- }
-
- /*
-	 * files from before the quota patch might have i_blocks such that
- * bytes < real_space. Deal with that here to prevent it from
- * going negative.
- */
- if (bytes < real_space)
- return 0;
- return (bytes - real_space) >> 9;
-}
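-
-/*
- * A worked example (assuming UNFM_P_SIZE == 4 and SD_V2_SIZE == 44):
- * for a regular 10000-byte file with 4 KiB blocks, real_space_diff()
- * is 3 * 4 + 44 == 56, so to_real_used_space() reports
- * 56 + (blocks << 9) bytes and to_fake_used_blocks() inverts exactly
- * that, keeping the two conversions consistent.
- */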
-
-/*
- * BAD: new directories have stat data of the new type and all other
- * items of the old type. The version stored in the inode describes the
- * body items, so in update_stat_data we cannot rely on the inode and
- * have to check the item version directly
- */
-
-/* called by read_locked_inode */
-static void init_inode(struct inode *inode, struct treepath *path)
-{
- struct buffer_head *bh;
- struct item_head *ih;
- __u32 rdev;
-
- bh = PATH_PLAST_BUFFER(path);
- ih = tp_item_head(path);
-
- copy_key(INODE_PKEY(inode), &ih->ih_key);
-
- INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
- REISERFS_I(inode)->i_flags = 0;
- REISERFS_I(inode)->i_prealloc_block = 0;
- REISERFS_I(inode)->i_prealloc_count = 0;
- REISERFS_I(inode)->i_trans_id = 0;
- REISERFS_I(inode)->i_jl = NULL;
- reiserfs_init_xattr_rwsem(inode);
-
- if (stat_data_v1(ih)) {
- struct stat_data_v1 *sd =
- (struct stat_data_v1 *)ih_item_body(bh, ih);
- unsigned long blocks;
-
- set_inode_item_key_version(inode, KEY_FORMAT_3_5);
- set_inode_sd_version(inode, STAT_DATA_V1);
- inode->i_mode = sd_v1_mode(sd);
- set_nlink(inode, sd_v1_nlink(sd));
- i_uid_write(inode, sd_v1_uid(sd));
- i_gid_write(inode, sd_v1_gid(sd));
- inode->i_size = sd_v1_size(sd);
- inode_set_atime(inode, sd_v1_atime(sd), 0);
- inode_set_mtime(inode, sd_v1_mtime(sd), 0);
- inode_set_ctime(inode, sd_v1_ctime(sd), 0);
-
- inode->i_blocks = sd_v1_blocks(sd);
- inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
- blocks = (inode->i_size + 511) >> 9;
- blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
-
-		/*
-		 * there was a bug in <=3.5.23 when i_blocks could take
-		 * negative values. Starting from 3.5.17 this value could
-		 * even be stored in the stat data. For such files we set
-		 * i_blocks based on the file size. Two notes: this can be
-		 * wrong for sparse files, and the on-disk value is only
-		 * updated if the file's inode ever changes
-		 */
- if (inode->i_blocks > blocks) {
- inode->i_blocks = blocks;
- }
-
- rdev = sd_v1_rdev(sd);
- REISERFS_I(inode)->i_first_direct_byte =
- sd_v1_first_direct_byte(sd);
-
- /*
- * an early bug in the quota code can give us an odd
- * number for the block count. This is incorrect, fix it here.
- */
- if (inode->i_blocks & 1) {
- inode->i_blocks++;
- }
- inode_set_bytes(inode,
- to_real_used_space(inode, inode->i_blocks,
- SD_V1_SIZE));
- /*
- * nopack is initially zero for v1 objects. For v2 objects,
- * nopack is initialised from sd_attrs
- */
- REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
- } else {
- /*
- * new stat data found, but object may have old items
- * (directories and symlinks)
- */
- struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
-
- inode->i_mode = sd_v2_mode(sd);
- set_nlink(inode, sd_v2_nlink(sd));
- i_uid_write(inode, sd_v2_uid(sd));
- inode->i_size = sd_v2_size(sd);
- i_gid_write(inode, sd_v2_gid(sd));
- inode_set_mtime(inode, sd_v2_mtime(sd), 0);
- inode_set_atime(inode, sd_v2_atime(sd), 0);
- inode_set_ctime(inode, sd_v2_ctime(sd), 0);
- inode->i_blocks = sd_v2_blocks(sd);
- rdev = sd_v2_rdev(sd);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- inode->i_generation =
- le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
- else
- inode->i_generation = sd_v2_generation(sd);
-
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- set_inode_item_key_version(inode, KEY_FORMAT_3_5);
- else
- set_inode_item_key_version(inode, KEY_FORMAT_3_6);
- REISERFS_I(inode)->i_first_direct_byte = 0;
- set_inode_sd_version(inode, STAT_DATA_V2);
- inode_set_bytes(inode,
- to_real_used_space(inode, inode->i_blocks,
- SD_V2_SIZE));
- /*
- * read persistent inode attributes from sd and initialise
- * generic inode flags from them
- */
- REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
- sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
- }
-
- pathrelse(path);
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &reiserfs_file_inode_operations;
- inode->i_fop = &reiserfs_file_operations;
- inode->i_mapping->a_ops = &reiserfs_address_space_operations;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &reiserfs_dir_inode_operations;
- inode->i_fop = &reiserfs_dir_operations;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &reiserfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &reiserfs_address_space_operations;
- } else {
- inode->i_blocks = 0;
- inode->i_op = &reiserfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
- }
-}
-
-/* update new stat data with inode fields */
-static void inode2sd(void *sd, struct inode *inode, loff_t size)
-{
- struct stat_data *sd_v2 = (struct stat_data *)sd;
-
- set_sd_v2_mode(sd_v2, inode->i_mode);
- set_sd_v2_nlink(sd_v2, inode->i_nlink);
- set_sd_v2_uid(sd_v2, i_uid_read(inode));
- set_sd_v2_size(sd_v2, size);
- set_sd_v2_gid(sd_v2, i_gid_read(inode));
- set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode));
- set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode));
- set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode));
- set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
- else
- set_sd_v2_generation(sd_v2, inode->i_generation);
- set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
-}
-
-/* used to copy inode's fields to old stat data */
-static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
-{
- struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
-
- set_sd_v1_mode(sd_v1, inode->i_mode);
- set_sd_v1_uid(sd_v1, i_uid_read(inode));
- set_sd_v1_gid(sd_v1, i_gid_read(inode));
- set_sd_v1_nlink(sd_v1, inode->i_nlink);
- set_sd_v1_size(sd_v1, size);
- set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode));
- set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode));
- set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode));
-
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
- else
- set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
-
- /* Sigh. i_first_direct_byte is back */
- set_sd_v1_first_direct_byte(sd_v1,
- REISERFS_I(inode)->i_first_direct_byte);
-}
-
-/*
- * NOTE, you must prepare the buffer head before sending it here,
- * and then log it after the call
- */
-static void update_stat_data(struct treepath *path, struct inode *inode,
- loff_t size)
-{
- struct buffer_head *bh;
- struct item_head *ih;
-
- bh = PATH_PLAST_BUFFER(path);
- ih = tp_item_head(path);
-
- if (!is_statdata_le_ih(ih))
- reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
- INODE_PKEY(inode), ih);
-
- /* path points to old stat data */
- if (stat_data_v1(ih)) {
- inode2sd_v1(ih_item_body(bh, ih), inode, size);
- } else {
- inode2sd(ih_item_body(bh, ih), inode, size);
- }
-
- return;
-}
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
- struct inode *inode, loff_t size)
-{
- struct cpu_key key;
- INITIALIZE_PATH(path);
- struct buffer_head *bh;
- int fs_gen;
- struct item_head *ih, tmp_ih;
- int retval;
-
- BUG_ON(!th->t_trans_id);
-
- /* key type is unimportant */
- make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
-
- for (;;) {
- int pos;
- /* look for the object's stat data */
- retval = search_item(inode->i_sb, &key, &path);
- if (retval == IO_ERROR) {
- reiserfs_error(inode->i_sb, "vs-13050",
- "i/o failure occurred trying to "
- "update %K stat data", &key);
- return;
- }
- if (retval == ITEM_NOT_FOUND) {
- pos = PATH_LAST_POSITION(&path);
- pathrelse(&path);
- if (inode->i_nlink == 0) {
- /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
- return;
- }
- reiserfs_warning(inode->i_sb, "vs-13060",
- "stat data of object %k (nlink == %d) "
- "not found (pos %d)",
- INODE_PKEY(inode), inode->i_nlink,
- pos);
- reiserfs_check_path(&path);
- return;
- }
-
- /*
- * sigh, prepare_for_journal might schedule. When it
- * schedules the FS might change. We have to detect that,
- * and loop back to the search if the stat data item has moved
- */
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- copy_item_head(&tmp_ih, ih);
- fs_gen = get_generation(inode->i_sb);
- reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
- /* Stat_data item has been moved after scheduling. */
- if (fs_changed(fs_gen, inode->i_sb)
- && item_moved(&tmp_ih, &path)) {
- reiserfs_restore_prepared_buffer(inode->i_sb, bh);
- continue;
- }
- break;
- }
- update_stat_data(&path, inode, size);
- journal_mark_dirty(th, bh);
- pathrelse(&path);
- return;
-}
-
-/*
- * reiserfs_read_locked_inode is called to read the inode off disk, and it
- * does a make_bad_inode when things go wrong. But, we need to make sure
- * to clear the key in the private portion of the inode, otherwise a
- * corresponding iput might try to delete whatever object the inode last
- * represented.
- */
-static void reiserfs_make_bad_inode(struct inode *inode)
-{
- memset(INODE_PKEY(inode), 0, KEY_SIZE);
- make_bad_inode(inode);
-}
-
-/*
- * initially this function was derived from minix or ext2's analog and
- * evolved as the prototype did
- */
-int reiserfs_init_locked_inode(struct inode *inode, void *p)
-{
- struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
- inode->i_ino = args->objectid;
- INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
- return 0;
-}
-
-/*
- * looks for the stat data in the tree and fills in the in-core
- * inode's fields from it
- */
-void reiserfs_read_locked_inode(struct inode *inode,
- struct reiserfs_iget_args *args)
-{
- INITIALIZE_PATH(path_to_sd);
- struct cpu_key key;
- unsigned long dirino;
- int retval;
-
- dirino = args->dirid;
-
-	/*
-	 * set version 1; version 2 could be used too, because the stat
-	 * data key is the same in both versions
-	 */
- _make_cpu_key(&key, KEY_FORMAT_3_5, dirino, inode->i_ino, 0, 0, 3);
-
- /* look for the object's stat data */
- retval = search_item(inode->i_sb, &key, &path_to_sd);
- if (retval == IO_ERROR) {
- reiserfs_error(inode->i_sb, "vs-13070",
- "i/o failure occurred trying to find "
- "stat data of %K", &key);
- reiserfs_make_bad_inode(inode);
- return;
- }
-
- /* a stale NFS handle can trigger this without it being an error */
- if (retval != ITEM_FOUND) {
- pathrelse(&path_to_sd);
- reiserfs_make_bad_inode(inode);
- clear_nlink(inode);
- return;
- }
-
- init_inode(inode, &path_to_sd);
-
-	/*
-	 * It is possible that knfsd is trying to access the inode of a
-	 * file that is being removed from the disk by some other thread.
-	 * As we update sd on unlink, all that is required is to check for
-	 * nlink here. This bug was first found by Sizif when debugging
-	 * SquidNG/Butterfly, forgotten, and found again after Philippe
-	 * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
-	 *
-	 * A more logical fix would require changes in fs/inode.c:iput() to
-	 * remove the inode from the hash table _after_ the fs has cleaned
-	 * up its disk state, and in iget() to return NULL if an I_FREEING
-	 * inode is found in the hash table.
-	 */
-
- /*
- * Currently there is one place where it's ok to meet inode with
- * nlink==0: processing of open-unlinked and half-truncated files
- * during mount (fs/reiserfs/super.c:finish_unfinished()).
- */
- if ((inode->i_nlink == 0) &&
- !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
- reiserfs_warning(inode->i_sb, "vs-13075",
- "dead inode read from disk %K. "
- "This is likely to be race with knfsd. Ignore",
- &key);
- reiserfs_make_bad_inode(inode);
- }
-
-	/* init_inode should be releasing the path */
- reiserfs_check_path(&path_to_sd);
-
- /*
- * Stat data v1 doesn't support ACLs.
- */
- if (get_inode_sd_version(inode) == STAT_DATA_V1)
- cache_no_acl(inode);
-}
-
-/*
- * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
- *
- * @inode: inode from hash table to check
- * @opaque: "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
- *
- * This function is called by iget5_locked() to distinguish reiserfs inodes
- * having the same inode numbers. Such inodes can only exist due to some
- * error condition. One of them should be bad. Inodes with identical
- * inode numbers (objectids) are distinguished by parent directory ids.
- *
- */
-int reiserfs_find_actor(struct inode *inode, void *opaque)
-{
- struct reiserfs_iget_args *args;
-
- args = opaque;
- /* args is already in CPU order */
- return (inode->i_ino == args->objectid) &&
- (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
-}
-
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
-{
- struct inode *inode;
- struct reiserfs_iget_args args;
- int depth;
-
- args.objectid = key->on_disk_key.k_objectid;
- args.dirid = key->on_disk_key.k_dir_id;
- depth = reiserfs_write_unlock_nested(s);
- inode = iget5_locked(s, key->on_disk_key.k_objectid,
- reiserfs_find_actor, reiserfs_init_locked_inode,
- (void *)(&args));
- reiserfs_write_lock_nested(s, depth);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- if (inode->i_state & I_NEW) {
- reiserfs_read_locked_inode(inode, &args);
- unlock_new_inode(inode);
- }
-
- if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
- /* either due to i/o error or a stale NFS handle */
- iput(inode);
- inode = NULL;
- }
- return inode;
-}
-
-static struct dentry *reiserfs_get_dentry(struct super_block *sb,
- u32 objectid, u32 dir_id, u32 generation)
-
-{
- struct cpu_key key;
- struct inode *inode;
-
- key.on_disk_key.k_objectid = objectid;
- key.on_disk_key.k_dir_id = dir_id;
- reiserfs_write_lock(sb);
- inode = reiserfs_iget(sb, &key);
- if (inode && !IS_ERR(inode) && generation != 0 &&
- generation != inode->i_generation) {
- iput(inode);
- inode = NULL;
- }
- reiserfs_write_unlock(sb);
-
- return d_obtain_alias(inode);
-}
-
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- /*
-	 * fhtype happens to reflect the number of u32s encoded.
-	 * Due to a bug in earlier code, fhtype might indicate there
-	 * are more u32s than actually fit, so if fhtype seems to be
-	 * more than len, reduce fhtype.
- * Valid types are:
- * 2 - objectid + dir_id - legacy support
- * 3 - objectid + dir_id + generation
- * 4 - objectid + dir_id + objectid and dirid of parent - legacy
- * 5 - objectid + dir_id + generation + objectid and dirid of parent
- * 6 - as above plus generation of directory
- * 6 does not fit in NFSv2 handles
- */
- if (fh_type > fh_len) {
- if (fh_type != 6 || fh_len != 5)
- reiserfs_warning(sb, "reiserfs-13077",
- "nfsd/reiserfs, fhtype=%d, len=%d - odd",
- fh_type, fh_len);
- fh_type = fh_len;
- }
- if (fh_len < 2)
- return NULL;
-
- return reiserfs_get_dentry(sb, fid->raw[0], fid->raw[1],
- (fh_type == 3 || fh_type >= 5) ? fid->raw[2] : 0);
-}
-
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
-{
- if (fh_type > fh_len)
- fh_type = fh_len;
- if (fh_type < 4)
- return NULL;
-
- return reiserfs_get_dentry(sb,
- (fh_type >= 5) ? fid->raw[3] : fid->raw[2],
- (fh_type >= 5) ? fid->raw[4] : fid->raw[3],
- (fh_type == 6) ? fid->raw[5] : 0);
-}
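-
-/*
- * For reference, the raw[] layout decoded by the two helpers above is
- * the one reiserfs_encode_fh() below produces: raw[0] objectid,
- * raw[1] dir_id, raw[2] generation, raw[3]/raw[4] the parent's
- * objectid/dir_id and raw[5] the parent's generation; the legacy
- * types (2 and 4) simply omit the generation words.
- */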
-
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
- struct inode *parent)
-{
- int maxlen = *lenp;
-
- if (parent && (maxlen < 5)) {
- *lenp = 5;
- return FILEID_INVALID;
- } else if (maxlen < 3) {
- *lenp = 3;
- return FILEID_INVALID;
- }
-
- data[0] = inode->i_ino;
- data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
- data[2] = inode->i_generation;
- *lenp = 3;
- if (parent) {
- data[3] = parent->i_ino;
- data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
- *lenp = 5;
- if (maxlen >= 6) {
- data[5] = parent->i_generation;
- *lenp = 6;
- }
- }
- return *lenp;
-}
-
-/*
- * looks for the stat data, then copies fields to it and marks the
- * buffer containing the stat data as dirty
- */
-/*
- * reiserfs inodes are never really dirty, since the dirty inode call
- * always logs them. This call allows the VFS inode marking routines
- * to properly mark inodes for datasync and such, but only actually
- * does something when called for a synchronous update.
- */
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- struct reiserfs_transaction_handle th;
- int jbegin_count = 1;
-
- if (sb_rdonly(inode->i_sb))
- return -EROFS;
-	/*
-	 * memory pressure can sometimes initiate write_inode calls with
-	 * sync == 1; these cases occur just when the system needs RAM,
-	 * not when the inode needs to reach disk for safety, and they
-	 * can safely be ignored because the altered inode has already
-	 * been logged.
-	 */
- if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
- reiserfs_write_lock(inode->i_sb);
- if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
- reiserfs_update_sd(&th, inode);
- journal_end_sync(&th);
- }
- reiserfs_write_unlock(inode->i_sb);
- }
- return 0;
-}
-
-/*
- * the stat data of the new object is already inserted; this inserts
- * the item containing the "." and ".." entries
- */
-static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct item_head *ih, struct treepath *path,
- struct inode *dir)
-{
- struct super_block *sb = th->t_super;
- char empty_dir[EMPTY_DIR_SIZE];
- char *body = empty_dir;
- struct cpu_key key;
- int retval;
-
- BUG_ON(!th->t_trans_id);
-
- _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
- le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
- TYPE_DIRENTRY, 3 /*key length */ );
-
- /*
- * compose item head for new item. Directories consist of items of
-	 * old type (ITEM_VERSION_1). Do not set the key (second arg is NULL);
-	 * that is done by reiserfs_new_inode
- */
- if (old_format_only(sb)) {
- make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
- TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
-
- make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
- ih->ih_key.k_objectid,
- INODE_PKEY(dir)->k_dir_id,
- INODE_PKEY(dir)->k_objectid);
- } else {
- make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
- TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
-
- make_empty_dir_item(body, ih->ih_key.k_dir_id,
- ih->ih_key.k_objectid,
- INODE_PKEY(dir)->k_dir_id,
- INODE_PKEY(dir)->k_objectid);
- }
-
- /* look for place in the tree for new item */
- retval = search_item(sb, &key, path);
- if (retval == IO_ERROR) {
- reiserfs_error(sb, "vs-13080",
- "i/o failure occurred creating new directory");
- return -EIO;
- }
- if (retval == ITEM_FOUND) {
- pathrelse(path);
- reiserfs_warning(sb, "vs-13070",
- "object with this key exists (%k)",
- &(ih->ih_key));
- return -EEXIST;
- }
-
- /* insert item, that is empty directory item */
- return reiserfs_insert_item(th, path, &key, ih, inode, body);
-}
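-
-/*
- * Annotation (not in the original source): the empty directory item
- * inserted above holds exactly two entries, "." pointing at the new
- * directory's own (dir_id, objectid) and ".." pointing at the parent's
- * (k_dir_id, k_objectid), keyed at DOT_OFFSET in the new directory.
- */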
-
-/*
- * stat data of object has been inserted, this inserts the item
- * containing the body of symlink
- */
-static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct item_head *ih,
- struct treepath *path, const char *symname,
- int item_len)
-{
- struct super_block *sb = th->t_super;
- struct cpu_key key;
- int retval;
-
- BUG_ON(!th->t_trans_id);
-
- _make_cpu_key(&key, KEY_FORMAT_3_5,
- le32_to_cpu(ih->ih_key.k_dir_id),
- le32_to_cpu(ih->ih_key.k_objectid),
- 1, TYPE_DIRECT, 3 /*key length */ );
-
- make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
- 0 /*free_space */ );
-
- /* look for place in the tree for new item */
- retval = search_item(sb, &key, path);
- if (retval == IO_ERROR) {
- reiserfs_error(sb, "vs-13080",
- "i/o failure occurred creating new symlink");
- return -EIO;
- }
- if (retval == ITEM_FOUND) {
- pathrelse(path);
- reiserfs_warning(sb, "vs-13080",
- "object with this key exists (%k)",
- &(ih->ih_key));
- return -EEXIST;
- }
-
- /* insert item, that is body of symlink */
- return reiserfs_insert_item(th, path, &key, ih, inode, symname);
-}
-
-/*
- * inserts the stat data into the tree, and then calls
- * reiserfs_new_directory (to insert ".", ".." item if new object is
- * directory) or reiserfs_new_symlink (to insert symlink body if new
- * object is symlink) or nothing (if new object is regular file)
- *
- * NOTE! uid and gid must already be set in the inode. If we return
- * non-zero due to an error, we have to drop the quota previously allocated
- * for the fresh inode. This can only be done outside a transaction, so
- * if we return non-zero, we also end the transaction.
- *
- * @th: active transaction handle
- * @dir: parent directory for new inode
- * @mode: mode of new inode
- * @symname: symlink contents if inode is symlink
- * @i_size: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
- *	symlinks
- * @dentry: dentry of the new entry (used for the privroot check and for
- *	ACL inheritance)
- * @inode: inode to be filled
- * @security: optional security context to associate with this inode
- */
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
- struct inode *dir, umode_t mode, const char *symname,
-		       /* 0 for regular, EMPTY_DIR_SIZE for dirs,
-			  strlen(symname) for symlinks */
- loff_t i_size, struct dentry *dentry,
- struct inode *inode,
- struct reiserfs_security_handle *security)
-{
- struct super_block *sb = dir->i_sb;
- struct reiserfs_iget_args args;
- INITIALIZE_PATH(path_to_key);
- struct cpu_key key;
- struct item_head ih;
- struct stat_data sd;
- int retval;
- int err;
- int depth;
-
- BUG_ON(!th->t_trans_id);
-
- depth = reiserfs_write_unlock_nested(sb);
- err = dquot_alloc_inode(inode);
- reiserfs_write_lock_nested(sb, depth);
- if (err)
- goto out_end_trans;
- if (!dir->i_nlink) {
- err = -EPERM;
- goto out_bad_inode;
- }
-
- /* item head of new item */
- ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
- ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
- if (!ih.ih_key.k_objectid) {
- err = -ENOMEM;
- goto out_bad_inode;
- }
- args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
- if (old_format_only(sb))
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
- TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
- else
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
- TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
- memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
- args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
-
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- err = insert_inode_locked4(inode, args.objectid,
- reiserfs_find_actor, &args);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- if (err) {
- err = -EINVAL;
- goto out_bad_inode;
- }
-
- if (old_format_only(sb))
- /*
- * not a perfect generation count, as object ids can be reused,
- * but this is as good as reiserfs can do right now.
- * note that the private part of inode isn't filled in yet,
- * we have to use the directory.
- */
- inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
- else
-#if defined(USE_INODE_GENERATION_COUNTER)
- inode->i_generation =
- le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
-#else
- inode->i_generation = ++event;
-#endif
-
- /* fill stat data */
- set_nlink(inode, (S_ISDIR(mode) ? 2 : 1));
-
- /* uid and gid must already be set by the caller for quota init */
-
- simple_inode_init_ts(inode);
- inode->i_size = i_size;
- inode->i_blocks = 0;
- inode->i_bytes = 0;
-	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
-	    U32_MAX /* NO_BYTES_IN_DIRECT_ITEM */;
-
- INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
- REISERFS_I(inode)->i_flags = 0;
- REISERFS_I(inode)->i_prealloc_block = 0;
- REISERFS_I(inode)->i_prealloc_count = 0;
- REISERFS_I(inode)->i_trans_id = 0;
- REISERFS_I(inode)->i_jl = NULL;
- REISERFS_I(inode)->i_attrs =
- REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
- sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
- reiserfs_init_xattr_rwsem(inode);
-
- /* key to search for correct place for new stat data */
- _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
- le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
- TYPE_STAT_DATA, 3 /*key length */ );
-
- /* find proper place for inserting of stat data */
- retval = search_item(sb, &key, &path_to_key);
- if (retval == IO_ERROR) {
- err = -EIO;
- goto out_bad_inode;
- }
- if (retval == ITEM_FOUND) {
- pathrelse(&path_to_key);
- err = -EEXIST;
- goto out_bad_inode;
- }
- if (old_format_only(sb)) {
- /* i_uid or i_gid is too big to be stored in stat data v3.5 */
- if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
- pathrelse(&path_to_key);
- err = -EINVAL;
- goto out_bad_inode;
- }
- inode2sd_v1(&sd, inode, inode->i_size);
- } else {
- inode2sd(&sd, inode, inode->i_size);
- }
-	/*
-	 * store in the in-core inode the key of the stat data and the
-	 * version all object items will have (directory items will have
-	 * the old offset format, other new objects will consist of new
-	 * items)
-	 */
- if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
- set_inode_item_key_version(inode, KEY_FORMAT_3_5);
- else
- set_inode_item_key_version(inode, KEY_FORMAT_3_6);
- if (old_format_only(sb))
- set_inode_sd_version(inode, STAT_DATA_V1);
- else
- set_inode_sd_version(inode, STAT_DATA_V2);
-
- /* insert the stat data into the tree */
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- if (REISERFS_I(dir)->new_packing_locality)
- th->displace_new_blocks = 1;
-#endif
- retval =
- reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
- (char *)(&sd));
- if (retval) {
- err = retval;
- reiserfs_check_path(&path_to_key);
- goto out_bad_inode;
- }
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- if (!th->displace_new_blocks)
- REISERFS_I(dir)->new_packing_locality = 0;
-#endif
- if (S_ISDIR(mode)) {
- /* insert item with "." and ".." */
- retval =
- reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
- }
-
- if (S_ISLNK(mode)) {
- /* insert body of symlink */
- if (!old_format_only(sb))
- i_size = ROUND_UP(i_size);
- retval =
- reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
- i_size);
- }
- if (retval) {
- err = retval;
- reiserfs_check_path(&path_to_key);
- journal_end(th);
- goto out_inserted_sd;
- }
-
- /*
- * Mark it private if we're creating the privroot
- * or something under it.
- */
- if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root)
- reiserfs_init_priv_inode(inode);
-
- if (reiserfs_posixacl(inode->i_sb)) {
- reiserfs_write_unlock(inode->i_sb);
- retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
- reiserfs_write_lock(inode->i_sb);
- if (retval) {
- err = retval;
- reiserfs_check_path(&path_to_key);
- journal_end(th);
- goto out_inserted_sd;
- }
- } else if (inode->i_sb->s_flags & SB_POSIXACL) {
- reiserfs_warning(inode->i_sb, "jdm-13090",
- "ACLs aren't enabled in the fs, "
- "but vfs thinks they are!");
- }
-
- if (security->name) {
- reiserfs_write_unlock(inode->i_sb);
- retval = reiserfs_security_write(th, inode, security);
- reiserfs_write_lock(inode->i_sb);
- if (retval) {
- err = retval;
- reiserfs_check_path(&path_to_key);
- retval = journal_end(th);
- if (retval)
- err = retval;
- goto out_inserted_sd;
- }
- }
-
- reiserfs_update_sd(th, inode);
- reiserfs_check_path(&path_to_key);
-
- return 0;
-
-out_bad_inode:
- /* Invalidate the object, nothing was inserted yet */
- INODE_PKEY(inode)->k_objectid = 0;
-
- /* Quota change must be inside a transaction for journaling */
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_free_inode(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-
-out_end_trans:
- journal_end(th);
- /*
- * Drop can be outside and it needs more credits so it's better
- * to have it outside
- */
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_drop(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- inode->i_flags |= S_NOQUOTA;
- make_bad_inode(inode);
-
-out_inserted_sd:
- clear_nlink(inode);
- th->t_trans_id = 0; /* so the caller can't use this handle later */
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- iput(inode);
- return err;
-}
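-
-/*
- * Annotation (not in the original source): the error labels above
- * unwind in fall-through order:
- *
- *   out_bad_inode:   nothing is in the tree yet; invalidate the key and
- *                    give back the quota inode charge, then fall through
- *   out_end_trans:   end the transaction, drop quota references and mark
- *                    the inode bad
- *   out_inserted_sd: stat data (and possibly more) reached the tree; the
- *                    transaction was already ended at the jump site, so
- *                    clear nlink and let iput() clean up
- */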
-
-/*
- * finds the tail page in the page cache,
- * reads the last block in.
- *
- * On success, page_result is set to a locked, pinned page, bh_result
- * is set to an up-to-date buffer for the last block in the file, and
- * 0 is returned.
- *
- * Tail conversion is not done, so bh_result might not be valid for
- * writing; check buffer_mapped(bh_result) and bh_result->b_blocknr != 0
- * before trying to write the block.
- *
- * On failure, nonzero is returned and page_result and bh_result are
- * untouched.
- */
-static int grab_tail_page(struct inode *inode,
- struct page **page_result,
- struct buffer_head **bh_result)
-{
-
- /*
- * we want the page with the last byte in the file,
- * not the page that will hold the next byte for appending
- */
- unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
- unsigned long pos = 0;
- unsigned long start = 0;
- unsigned long blocksize = inode->i_sb->s_blocksize;
- unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
- struct buffer_head *bh;
- struct buffer_head *head;
- struct folio *folio;
- int error;
-
- /*
- * we know that we are only called with inode->i_size > 0.
- * we also know that a file tail can never be as big as a block
- * If i_size % blocksize == 0, our file is currently block aligned
- * and it won't need converting or zeroing after a truncate.
- */
- if ((offset & (blocksize - 1)) == 0) {
- return -ENOENT;
- }
- folio = __filemap_get_folio(inode->i_mapping, index,
- FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
- mapping_gfp_mask(inode->i_mapping));
- if (IS_ERR(folio))
- return PTR_ERR(folio);
- /* start within the page of the last block in the file */
- start = (offset / blocksize) * blocksize;
-
- error = __block_write_begin(folio, start, offset - start,
- reiserfs_get_block_create_0);
- if (error)
- goto unlock;
-
- head = folio_buffers(folio);
- bh = head;
- do {
- if (pos >= start) {
- break;
- }
- bh = bh->b_this_page;
- pos += blocksize;
- } while (bh != head);
-
- if (!buffer_uptodate(bh)) {
- /*
- * note, this should never happen, prepare_write should be
- * taking care of this for us. If the buffer isn't up to
- * date, I've screwed up the code to find the buffer, or the
- * code to call prepare_write
- */
- reiserfs_error(inode->i_sb, "clm-6000",
- "error reading block %lu", bh->b_blocknr);
- error = -EIO;
- goto unlock;
- }
- *bh_result = bh;
- *page_result = &folio->page;
-
- return error;
-
-unlock:
- folio_unlock(folio);
- folio_put(folio);
- return error;
-}
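-
-/*
- * Worked example (annotation, not in the original source): with
- * i_size == 8193, PAGE_SIZE == 4096 and blocksize == 4096,
- * grab_tail_page() computes index == 2, offset == 1 and start == 0,
- * i.e. it grabs the third page and prepares just the single tail byte
- * that lives in the file's last block.
- */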
-
-/*
- * vfs version of truncate file. Must NOT be called with
- * a transaction already started.
- *
- * some code taken from block_truncate_page
- */
-int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
-{
- struct reiserfs_transaction_handle th;
- /* we want the offset for the first byte after the end of the file */
- unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
- unsigned blocksize = inode->i_sb->s_blocksize;
- unsigned length;
- struct page *page = NULL;
- int error;
- struct buffer_head *bh = NULL;
- int err2;
-
- reiserfs_write_lock(inode->i_sb);
-
- if (inode->i_size > 0) {
- error = grab_tail_page(inode, &page, &bh);
- if (error) {
- /*
- * -ENOENT means we truncated past the end of the
- * file, and get_block_create_0 could not find a
- * block to read in, which is ok.
- */
- if (error != -ENOENT)
- reiserfs_error(inode->i_sb, "clm-6001",
- "grab_tail_page failed %d",
- error);
- page = NULL;
- bh = NULL;
- }
- }
-
- /*
- * so, if page != NULL, we have a buffer head for the offset at
- * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
- * then we have an unformatted node. Otherwise, we have a direct item,
- * and no zeroing is required on disk. We zero after the truncate,
- * because the truncate might pack the item anyway
- * (it will unmap bh if it packs).
- *
-	 * it is enough to reserve space in the transaction for 2 balancings:
-	 * one for adding the "save" link and another for the first
-	 * cut_from_item. The extra 1 is for update_sd.
- */
- error = journal_begin(&th, inode->i_sb,
- JOURNAL_PER_BALANCE_CNT * 2 + 1);
- if (error)
- goto out;
- reiserfs_update_inode_transaction(inode);
- if (update_timestamps)
- /*
- * we are doing real truncate: if the system crashes
- * before the last transaction of truncating gets committed
- * - on reboot the file either appears truncated properly
- * or not truncated at all
- */
- add_save_link(&th, inode, 1);
- err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
- error = journal_end(&th);
- if (error)
- goto out;
-
- /* check reiserfs_do_truncate after ending the transaction */
- if (err2) {
- error = err2;
- goto out;
- }
-
- if (update_timestamps) {
- error = remove_save_link(inode, 1 /* truncate */);
- if (error)
- goto out;
- }
-
- if (page) {
- length = offset & (blocksize - 1);
- /* if we are not on a block boundary */
- if (length) {
- length = blocksize - length;
- zero_user(page, offset, length);
- if (buffer_mapped(bh) && bh->b_blocknr != 0) {
- mark_buffer_dirty(bh);
- }
- }
- unlock_page(page);
- put_page(page);
- }
-
- reiserfs_write_unlock(inode->i_sb);
-
- return 0;
-out:
- if (page) {
- unlock_page(page);
- put_page(page);
- }
-
- reiserfs_write_unlock(inode->i_sb);
-
- return error;
-}
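-
-/*
- * Worked example (annotation, not in the original source): truncating
- * to i_size == 5000 with PAGE_SIZE == 4096 and blocksize == 1024 gives
- * offset == 904 and length == 1024 - (904 & 1023) == 120, so
- * zero_user() above clears bytes 904..1023, the remainder of the block
- * that now holds the end of the file.
- */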
-
-static int map_block_for_writepage(struct inode *inode,
- struct buffer_head *bh_result,
- unsigned long block)
-{
- struct reiserfs_transaction_handle th;
- int fs_gen;
- struct item_head tmp_ih;
- struct item_head *ih;
- struct buffer_head *bh;
- __le32 *item;
- struct cpu_key key;
- INITIALIZE_PATH(path);
- int pos_in_item;
- int jbegin_count = JOURNAL_PER_BALANCE_CNT;
- loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
- int retval;
- int use_get_block = 0;
- int bytes_copied = 0;
- int copy_size;
- int trans_running = 0;
-
- /*
- * catch places below that try to log something without
- * starting a trans
- */
- th.t_trans_id = 0;
-
- if (!buffer_uptodate(bh_result)) {
- return -EIO;
- }
-
- kmap(bh_result->b_page);
-start_over:
- reiserfs_write_lock(inode->i_sb);
- make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
-
-research:
- retval = search_for_position_by_key(inode->i_sb, &key, &path);
- if (retval != POSITION_FOUND) {
- use_get_block = 1;
- goto out;
- }
-
- bh = get_last_bh(&path);
- ih = tp_item_head(&path);
- item = tp_item_body(&path);
- pos_in_item = path.pos_in_item;
-
- /* we've found an unformatted node */
- if (indirect_item_found(retval, ih)) {
- if (bytes_copied > 0) {
- reiserfs_warning(inode->i_sb, "clm-6002",
- "bytes_copied %d", bytes_copied);
- }
- if (!get_block_num(item, pos_in_item)) {
- /* crap, we are writing to a hole */
- use_get_block = 1;
- goto out;
- }
- set_block_dev_mapped(bh_result,
- get_block_num(item, pos_in_item), inode);
- } else if (is_direct_le_ih(ih)) {
- char *p;
- p = page_address(bh_result->b_page);
- p += (byte_offset - 1) & (PAGE_SIZE - 1);
- copy_size = ih_item_len(ih) - pos_in_item;
-
- fs_gen = get_generation(inode->i_sb);
- copy_item_head(&tmp_ih, ih);
-
- if (!trans_running) {
- /* vs-3050 is gone, no need to drop the path */
- retval = journal_begin(&th, inode->i_sb, jbegin_count);
- if (retval)
- goto out;
- reiserfs_update_inode_transaction(inode);
- trans_running = 1;
- if (fs_changed(fs_gen, inode->i_sb)
- && item_moved(&tmp_ih, &path)) {
- reiserfs_restore_prepared_buffer(inode->i_sb,
- bh);
- goto research;
- }
- }
-
- reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
-
- if (fs_changed(fs_gen, inode->i_sb)
- && item_moved(&tmp_ih, &path)) {
- reiserfs_restore_prepared_buffer(inode->i_sb, bh);
- goto research;
- }
-
- memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
- copy_size);
-
- journal_mark_dirty(&th, bh);
- bytes_copied += copy_size;
- set_block_dev_mapped(bh_result, 0, inode);
-
- /* are there still bytes left? */
- if (bytes_copied < bh_result->b_size &&
- (byte_offset + bytes_copied) < inode->i_size) {
- set_cpu_key_k_offset(&key,
- cpu_key_k_offset(&key) +
- copy_size);
- goto research;
- }
- } else {
- reiserfs_warning(inode->i_sb, "clm-6003",
- "bad item inode %lu", inode->i_ino);
- retval = -EIO;
- goto out;
- }
- retval = 0;
-
-out:
- pathrelse(&path);
- if (trans_running) {
- int err = journal_end(&th);
- if (err)
- retval = err;
- trans_running = 0;
- }
- reiserfs_write_unlock(inode->i_sb);
-
- /* this is where we fill in holes in the file. */
- if (use_get_block) {
- retval = reiserfs_get_block(inode, block, bh_result,
- GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
- | GET_BLOCK_NO_DANGLE);
- if (!retval) {
- if (!buffer_mapped(bh_result)
- || bh_result->b_blocknr == 0) {
- /* get_block failed to find a mapped unformatted node. */
- use_get_block = 0;
- goto start_over;
- }
- }
- }
- kunmap(bh_result->b_page);
-
- if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /*
- * we've copied data from the page into the direct item, so the
- * buffer in the page is now clean, mark it to reflect that.
- */
- lock_buffer(bh_result);
- clear_buffer_dirty(bh_result);
- unlock_buffer(bh_result);
- }
- return retval;
-}
-
-/*
- * mason@suse.com: updated in 2.5.54 to follow the same general io
- * start/recovery path as __block_write_full_folio, along with special
- * code to handle reiserfs tails.
- */
-static int reiserfs_write_folio(struct folio *folio,
- struct writeback_control *wbc, void *data)
-{
- struct inode *inode = folio->mapping->host;
- unsigned long end_index = inode->i_size >> PAGE_SHIFT;
- int error = 0;
- unsigned long block;
- sector_t last_block;
- struct buffer_head *head, *bh;
- int partial = 0;
- int nr = 0;
- int checked = folio_test_checked(folio);
- struct reiserfs_transaction_handle th;
- struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_SIZE / s->s_blocksize;
- th.t_trans_id = 0;
-
- /* no logging allowed when nonblocking or from PF_MEMALLOC */
- if (checked && (current->flags & PF_MEMALLOC)) {
- folio_redirty_for_writepage(wbc, folio);
- folio_unlock(folio);
- return 0;
- }
-
-	/*
-	 * The folio dirty bit is cleared before writepage is called, which
-	 * means we have to tell create_empty_buffers to make dirty buffers.
-	 * The folio really should be up to date at this point, so tossing
-	 * in the BH_Uptodate is just a sanity check.
-	 */
- head = folio_buffers(folio);
- if (!head)
- head = create_empty_buffers(folio, s->s_blocksize,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
-
- /*
- * last folio in the file, zero out any contents past the
- * last byte in the file
- */
- if (folio->index >= end_index) {
- unsigned last_offset;
-
- last_offset = inode->i_size & (PAGE_SIZE - 1);
- /* no file contents in this folio */
- if (folio->index >= end_index + 1 || !last_offset) {
- folio_unlock(folio);
- return 0;
- }
- folio_zero_segment(folio, last_offset, folio_size(folio));
- }
- bh = head;
- block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits);
- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
- /* first map all the buffers, logging any direct items we find */
- do {
- if (block > last_block) {
- /*
- * This can happen when the block size is less than
- * the folio size. The corresponding bytes in the folio
- * were zero filled above
- */
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- } else if ((checked || buffer_dirty(bh)) &&
- (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
- /*
- * not mapped yet, or it points to a direct item, search
- * the btree for the mapping info, and log any direct
- * items found
- */
- if ((error = map_block_for_writepage(inode, bh, block))) {
- goto fail;
- }
- }
- bh = bh->b_this_page;
- block++;
- } while (bh != head);
-
- /*
- * we start the transaction after map_block_for_writepage,
- * because it can create holes in the file (an unbounded operation).
- * starting it here, we can make a reliable estimate for how many
- * blocks we're going to log
- */
- if (checked) {
- folio_clear_checked(folio);
- reiserfs_write_lock(s);
- error = journal_begin(&th, s, bh_per_page + 1);
- if (error) {
- reiserfs_write_unlock(s);
- goto fail;
- }
- reiserfs_update_inode_transaction(inode);
- }
- /* now go through and lock any dirty buffers on the folio */
- do {
- get_bh(bh);
- if (!buffer_mapped(bh))
- continue;
- if (buffer_mapped(bh) && bh->b_blocknr == 0)
- continue;
-
- if (checked) {
- reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, bh);
- continue;
- }
- /*
- * from this point on, we know the buffer is mapped to a
- * real block and not a direct item
- */
- if (wbc->sync_mode != WB_SYNC_NONE) {
- lock_buffer(bh);
- } else {
- if (!trylock_buffer(bh)) {
- folio_redirty_for_writepage(wbc, folio);
- continue;
- }
- }
- if (test_clear_buffer_dirty(bh)) {
- mark_buffer_async_write(bh);
- } else {
- unlock_buffer(bh);
- }
- } while ((bh = bh->b_this_page) != head);
-
- if (checked) {
- error = journal_end(&th);
- reiserfs_write_unlock(s);
- if (error)
- goto fail;
- }
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
-
- /*
- * since any buffer might be the only dirty buffer on the folio,
- * the first submit_bh can bring the folio out of writeback.
- * be careful with the buffers.
- */
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- submit_bh(REQ_OP_WRITE, bh);
- nr++;
- }
- put_bh(bh);
- bh = next;
- } while (bh != head);
-
- error = 0;
-done:
- if (nr == 0) {
- /*
- * if this folio only had a direct item, it is very possible for
- * no io to be required without there being an error. Or,
- * someone else could have locked them and sent them down the
- * pipe without locking the folio
- */
- bh = head;
- do {
- if (!buffer_uptodate(bh)) {
- partial = 1;
- break;
- }
- bh = bh->b_this_page;
- } while (bh != head);
- if (!partial)
- folio_mark_uptodate(folio);
- folio_end_writeback(folio);
- }
- return error;
-
-fail:
- /*
- * catches various errors, we need to make sure any valid dirty blocks
- * get to the media. The folio is currently locked and not marked for
- * writeback
- */
- folio_clear_uptodate(folio);
- bh = head;
- do {
- get_bh(bh);
- if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
- lock_buffer(bh);
- mark_buffer_async_write(bh);
- } else {
- /*
- * clear any dirty bits that might have come from
- * getting attached to a dirty folio
- */
- clear_buffer_dirty(bh);
- }
- bh = bh->b_this_page;
- } while (bh != head);
- BUG_ON(folio_test_writeback(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
- do {
- struct buffer_head *next = bh->b_this_page;
- if (buffer_async_write(bh)) {
- clear_buffer_dirty(bh);
- submit_bh(REQ_OP_WRITE, bh);
- nr++;
- }
- put_bh(bh);
- bh = next;
- } while (bh != head);
- goto done;
-}
-
-static int reiserfs_read_folio(struct file *f, struct folio *folio)
-{
- return block_read_full_folio(folio, reiserfs_get_block);
-}
-
-static int reiserfs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- reiserfs_wait_on_write_block(mapping->host->i_sb);
- return write_cache_pages(mapping, wbc, reiserfs_write_folio, NULL);
-}
-
-static void reiserfs_truncate_failed_write(struct inode *inode)
-{
- truncate_inode_pages(inode->i_mapping, inode->i_size);
- reiserfs_truncate_file(inode, 0);
-}
-
-static int reiserfs_write_begin(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- struct inode *inode;
- struct folio *folio;
- pgoff_t index;
- int ret;
- int old_ref = 0;
-
- inode = mapping->host;
- index = pos >> PAGE_SHIFT;
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- return PTR_ERR(folio);
- *foliop = folio;
-
- reiserfs_wait_on_write_block(inode->i_sb);
- fix_tail_page_for_writing(&folio->page);
- if (reiserfs_transaction_running(inode->i_sb)) {
- struct reiserfs_transaction_handle *th;
- th = (struct reiserfs_transaction_handle *)current->
- journal_info;
- BUG_ON(!th->t_refcount);
- BUG_ON(!th->t_trans_id);
- old_ref = th->t_refcount;
- th->t_refcount++;
- }
- ret = __block_write_begin(folio, pos, len, reiserfs_get_block);
- if (ret && reiserfs_transaction_running(inode->i_sb)) {
- struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly. If reiserfs_get_block returned an
-		 * error and left a transaction running, we've got to close
-		 * it, and we've got to free the handle if it was a
-		 * persistent transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans. Otherwise, it was nested
-		 * above.
-		 */
- if (th->t_refcount > old_ref) {
- if (old_ref)
- th->t_refcount--;
- else {
- int err;
- reiserfs_write_lock(inode->i_sb);
- err = reiserfs_end_persistent_transaction(th);
- reiserfs_write_unlock(inode->i_sb);
- if (err)
- ret = err;
- }
- }
- }
- if (ret) {
- folio_unlock(folio);
- folio_put(folio);
- /* Truncate allocated blocks */
- reiserfs_truncate_failed_write(inode);
- }
- return ret;
-}
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
-{
- struct inode *inode = page->mapping->host;
- int ret;
- int old_ref = 0;
- int depth;
-
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- reiserfs_wait_on_write_block(inode->i_sb);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-
- fix_tail_page_for_writing(page);
- if (reiserfs_transaction_running(inode->i_sb)) {
- struct reiserfs_transaction_handle *th;
- th = (struct reiserfs_transaction_handle *)current->
- journal_info;
- BUG_ON(!th->t_refcount);
- BUG_ON(!th->t_trans_id);
- old_ref = th->t_refcount;
- th->t_refcount++;
- }
-
- ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block);
- if (ret && reiserfs_transaction_running(inode->i_sb)) {
- struct reiserfs_transaction_handle *th = current->journal_info;
-		/*
-		 * this gets a little ugly. If reiserfs_get_block returned an
-		 * error and left a transaction running, we've got to close
-		 * it, and we've got to free the handle if it was a
-		 * persistent transaction.
-		 *
-		 * But, if we had nested into an existing transaction, we need
-		 * to just drop the ref count on the handle.
-		 *
-		 * If old_ref == 0, the transaction is from reiserfs_get_block,
-		 * and it was a persistent trans. Otherwise, it was nested
-		 * above.
-		 */
- if (th->t_refcount > old_ref) {
- if (old_ref)
- th->t_refcount--;
- else {
- int err;
- reiserfs_write_lock(inode->i_sb);
- err = reiserfs_end_persistent_transaction(th);
- reiserfs_write_unlock(inode->i_sb);
- if (err)
- ret = err;
- }
- }
- }
- return ret;
-
-}
-
-static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
-{
- return generic_block_bmap(as, block, reiserfs_bmap);
-}
-
-static int reiserfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct inode *inode = folio->mapping->host;
- int ret = 0;
- int update_sd = 0;
- struct reiserfs_transaction_handle *th;
- unsigned start;
- bool locked = false;
-
- reiserfs_wait_on_write_block(inode->i_sb);
- if (reiserfs_transaction_running(inode->i_sb))
- th = current->journal_info;
- else
- th = NULL;
-
- start = pos & (PAGE_SIZE - 1);
- if (unlikely(copied < len)) {
- if (!folio_test_uptodate(folio))
- copied = 0;
-
- folio_zero_new_buffers(folio, start + copied, start + len);
- }
- flush_dcache_folio(folio);
-
- reiserfs_commit_page(inode, &folio->page, start, start + copied);
-
- /*
- * generic_commit_write does this for us, but does not update the
- * transaction tracking stuff when the size changes. So, we have
- * to do the i_size updates here.
- */
- if (pos + copied > inode->i_size) {
- struct reiserfs_transaction_handle myth;
- reiserfs_write_lock(inode->i_sb);
- locked = true;
-		/*
-		 * If the file has grown beyond the boundary where it
-		 * can have a tail, unmark it as needing tail
-		 * packing
-		 */
- if ((have_large_tails(inode->i_sb)
- && inode->i_size > i_block_size(inode) * 4)
- || (have_small_tails(inode->i_sb)
- && inode->i_size > i_block_size(inode)))
- REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
- ret = journal_begin(&myth, inode->i_sb, 1);
- if (ret)
- goto journal_error;
-
- reiserfs_update_inode_transaction(inode);
- inode->i_size = pos + copied;
- /*
- * this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around on
- * the dirty lists, and so that O_SYNC works as expected
- */
- mark_inode_dirty(inode);
- reiserfs_update_sd(&myth, inode);
- update_sd = 1;
- ret = journal_end(&myth);
- if (ret)
- goto journal_error;
- }
- if (th) {
- if (!locked) {
- reiserfs_write_lock(inode->i_sb);
- locked = true;
- }
- if (!update_sd)
- mark_inode_dirty(inode);
- ret = reiserfs_end_persistent_transaction(th);
- if (ret)
- goto out;
- }
-
-out:
- if (locked)
- reiserfs_write_unlock(inode->i_sb);
- folio_unlock(folio);
- folio_put(folio);
-
- if (pos + len > inode->i_size)
- reiserfs_truncate_failed_write(inode);
-
- return ret == 0 ? copied : ret;
-
-journal_error:
- reiserfs_write_unlock(inode->i_sb);
- locked = false;
- if (th) {
- if (!update_sd)
- reiserfs_update_sd(th, inode);
- ret = reiserfs_end_persistent_transaction(th);
- }
- goto out;
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
- unsigned from, unsigned to)
-{
- struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
- int ret = 0;
- int update_sd = 0;
- struct reiserfs_transaction_handle *th = NULL;
- int depth;
-
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- reiserfs_wait_on_write_block(inode->i_sb);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-
- if (reiserfs_transaction_running(inode->i_sb)) {
- th = current->journal_info;
- }
- reiserfs_commit_page(inode, page, from, to);
-
- /*
- * generic_commit_write does this for us, but does not update the
- * transaction tracking stuff when the size changes. So, we have
- * to do the i_size updates here.
- */
- if (pos > inode->i_size) {
- struct reiserfs_transaction_handle myth;
-		/*
-		 * If the file has grown beyond the boundary where it
-		 * can have a tail, unmark it as needing tail
-		 * packing
-		 */
- if ((have_large_tails(inode->i_sb)
- && inode->i_size > i_block_size(inode) * 4)
- || (have_small_tails(inode->i_sb)
- && inode->i_size > i_block_size(inode)))
- REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-
- ret = journal_begin(&myth, inode->i_sb, 1);
- if (ret)
- goto journal_error;
-
- reiserfs_update_inode_transaction(inode);
- inode->i_size = pos;
- /*
- * this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around
- * on the dirty lists, and so that O_SYNC works as expected
- */
- mark_inode_dirty(inode);
- reiserfs_update_sd(&myth, inode);
- update_sd = 1;
- ret = journal_end(&myth);
- if (ret)
- goto journal_error;
- }
- if (th) {
- if (!update_sd)
- mark_inode_dirty(inode);
- ret = reiserfs_end_persistent_transaction(th);
- if (ret)
- goto out;
- }
-
-out:
- return ret;
-
-journal_error:
- if (th) {
- if (!update_sd)
- reiserfs_update_sd(th, inode);
- ret = reiserfs_end_persistent_transaction(th);
- }
-
- return ret;
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
-{
- if (reiserfs_attrs(inode->i_sb)) {
- if (sd_attrs & REISERFS_SYNC_FL)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (sd_attrs & REISERFS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (sd_attrs & REISERFS_APPEND_FL)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (sd_attrs & REISERFS_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
- if (sd_attrs & REISERFS_NOTAIL_FL)
- REISERFS_I(inode)->i_flags |= i_nopack_mask;
- else
- REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
- }
-}
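-
-/*
- * A minimal table-driven sketch of the plain-flag mapping above, shown
- * only as an illustration (hypothetical helper, never part of this
- * file; it skips the reiserfs_attrs() guard and REISERFS_NOTAIL_FL):
- */
-#if 0
-static const struct {
-	__u16 sd;
-	unsigned int vfs;
-} sd_flag_map[] = {
-	{ REISERFS_SYNC_FL,      S_SYNC      },
-	{ REISERFS_IMMUTABLE_FL, S_IMMUTABLE },
-	{ REISERFS_APPEND_FL,    S_APPEND    },
-	{ REISERFS_NOATIME_FL,   S_NOATIME   },
-};
-
-static void sd_attrs_to_i_attrs_tbl(__u16 sd_attrs, struct inode *inode)
-{
-	int i;
-
-	/* set or clear each VFS flag from its on-disk counterpart */
-	for (i = 0; i < ARRAY_SIZE(sd_flag_map); i++) {
-		if (sd_attrs & sd_flag_map[i].sd)
-			inode->i_flags |= sd_flag_map[i].vfs;
-		else
-			inode->i_flags &= ~sd_flag_map[i].vfs;
-	}
-}
-#endif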
-
-/*
- * decide if this buffer needs to stay around for data logging or ordered
- * write purposes
- */
-static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
-{
- int ret = 1;
- struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-
- lock_buffer(bh);
- spin_lock(&j->j_dirty_buffers_lock);
- if (!buffer_mapped(bh)) {
- goto free_jh;
- }
- /*
- * the page is locked, and the only places that log a data buffer
- * also lock the page.
- */
- if (reiserfs_file_data_log(inode)) {
- /*
- * very conservative, leave the buffer pinned if
- * anyone might need it.
- */
- if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
- ret = 0;
- }
- } else if (buffer_dirty(bh)) {
- struct reiserfs_journal_list *jl;
- struct reiserfs_jh *jh = bh->b_private;
-
- /*
- * why is this safe?
- * reiserfs_setattr updates i_size in the on disk
- * stat data before allowing vmtruncate to be called.
- *
- * If buffer was put onto the ordered list for this
- * transaction, we know for sure either this transaction
- * or an older one already has updated i_size on disk,
- * and this ordered data won't be referenced in the file
- * if we crash.
- *
- * if the buffer was put onto the ordered list for an older
- * transaction, we need to leave it around
- */
- if (jh && (jl = jh->jl)
- && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
- ret = 0;
- }
-free_jh:
- if (ret && bh->b_private) {
- reiserfs_free_jh(bh);
- }
- spin_unlock(&j->j_dirty_buffers_lock);
- unlock_buffer(bh);
- return ret;
-}
-
-/* clm -- taken from fs/buffer.c:block_invalidate_folio */
-static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- struct buffer_head *head, *bh, *next;
- struct inode *inode = folio->mapping->host;
- unsigned int curr_off = 0;
- unsigned int stop = offset + length;
- int partial_page = (offset || length < folio_size(folio));
- int ret = 1;
-
- BUG_ON(!folio_test_locked(folio));
-
- if (!partial_page)
- folio_clear_checked(folio);
-
- head = folio_buffers(folio);
- if (!head)
- goto out;
-
- bh = head;
- do {
- unsigned int next_off = curr_off + bh->b_size;
- next = bh->b_this_page;
-
- if (next_off > stop)
- goto out;
-
- /*
- * is this block fully invalidated?
- */
- if (offset <= curr_off) {
- if (invalidate_folio_can_drop(inode, bh))
- reiserfs_unmap_buffer(bh);
- else
- ret = 0;
- }
- curr_off = next_off;
- bh = next;
- } while (bh != head);
-
- /*
- * We release buffers only if the entire page is being invalidated.
- * The get_block cached value has been unconditionally invalidated,
- * so real IO is not possible anymore.
- */
- if (!partial_page && ret) {
- ret = filemap_release_folio(folio, 0);
- /* maybe should BUG_ON(!ret); - neilb */
- }
-out:
- return;
-}
-
-static bool reiserfs_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- if (reiserfs_file_data_log(mapping->host)) {
- folio_set_checked(folio);
- return filemap_dirty_folio(mapping, folio);
- }
- return block_dirty_folio(mapping, folio);
-}
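-
-/*
- * Annotation (not in the original source): for data-journaled files the
- * folio is tagged "checked" here; reiserfs_write_folio() above then
- * routes every buffer on such folios through the journal instead of
- * submitting them directly.
- */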
-
-/*
- * Returns true if the folio's buffers were dropped. The folio is locked.
- *
- * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
- * in the buffers at folio_buffers(folio).
- *
- * even in -o notail mode, we can't be sure an old mount without -o notail
- * didn't create files with tails.
- */
-static bool reiserfs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
-{
- struct inode *inode = folio->mapping->host;
- struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
- struct buffer_head *head;
- struct buffer_head *bh;
- bool ret = true;
-
- WARN_ON(folio_test_checked(folio));
- spin_lock(&j->j_dirty_buffers_lock);
- head = folio_buffers(folio);
- bh = head;
- do {
- if (bh->b_private) {
- if (!buffer_dirty(bh) && !buffer_locked(bh)) {
- reiserfs_free_jh(bh);
- } else {
- ret = false;
- break;
- }
- }
- bh = bh->b_this_page;
- } while (bh != head);
- if (ret)
- ret = try_to_free_buffers(folio);
- spin_unlock(&j->j_dirty_buffers_lock);
- return ret;
-}
-
-/*
- * We thank Mingming Cao for helping us understand in great detail what
- * to do in this section of the code.
- */
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
-
- ret = blockdev_direct_IO(iocb, inode, iter,
- reiserfs_get_blocks_direct_io);
-
- /*
- * In case of error extending write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- */
- if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = iocb->ki_pos + count;
-
- if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
- truncate_setsize(inode, isize);
- reiserfs_vfs_truncate_file(inode);
- }
- }
-
- return ret;
-}
-
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- unsigned int ia_valid;
- int error;
-
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (error)
- return error;
-
- /* must be turned off for recursive notify_change calls */
- ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
-
- if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
- error = dquot_initialize(inode);
- if (error)
- return error;
- }
- reiserfs_write_lock(inode->i_sb);
- if (attr->ia_valid & ATTR_SIZE) {
- /*
- * version 2 items will be caught by the s_maxbytes check
- * done for us in vmtruncate
- */
- if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
- attr->ia_size > MAX_NON_LFS) {
- reiserfs_write_unlock(inode->i_sb);
- error = -EFBIG;
- goto out;
- }
-
- inode_dio_wait(inode);
-
- /* fill in hole pointers in the expanding truncate case. */
- if (attr->ia_size > inode->i_size) {
- loff_t pos = attr->ia_size;
-
- if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
- pos++;
- error = generic_cont_expand_simple(inode, pos);
- if (REISERFS_I(inode)->i_prealloc_count > 0) {
- int err;
- struct reiserfs_transaction_handle th;
- /* we're changing at most 2 bitmaps, inode + super */
- err = journal_begin(&th, inode->i_sb, 4);
- if (!err) {
- reiserfs_discard_prealloc(&th, inode);
- err = journal_end(&th);
- }
- if (err)
- error = err;
- }
- if (error) {
- reiserfs_write_unlock(inode->i_sb);
- goto out;
- }
- /*
- * file size is changed, ctime and mtime are
- * to be updated
- */
- attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
- }
- }
- reiserfs_write_unlock(inode->i_sb);
-
- if ((((attr->ia_valid & ATTR_UID) && (from_kuid(&init_user_ns, attr->ia_uid) & ~0xffff)) ||
- ((attr->ia_valid & ATTR_GID) && (from_kgid(&init_user_ns, attr->ia_gid) & ~0xffff))) &&
- (get_inode_sd_version(inode) == STAT_DATA_V1)) {
- /* stat data of format v3.5 has 16 bit uid and gid */
- error = -EINVAL;
- goto out;
- }
-
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
- struct reiserfs_transaction_handle th;
- int jbegin_count =
- 2 *
- (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
- REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
- 2;
-
- error = reiserfs_chown_xattrs(inode, attr);
-
- if (error)
- return error;
-
-		/*
-		 * (user+group)*(old+new) quota structures: we count the
-		 * quota info update and the inode write (sb, inode)
-		 */
- reiserfs_write_lock(inode->i_sb);
- error = journal_begin(&th, inode->i_sb, jbegin_count);
- reiserfs_write_unlock(inode->i_sb);
- if (error)
- goto out;
- error = dquot_transfer(&nop_mnt_idmap, inode, attr);
- reiserfs_write_lock(inode->i_sb);
- if (error) {
- journal_end(&th);
- reiserfs_write_unlock(inode->i_sb);
- goto out;
- }
-
- /*
- * Update corresponding info in inode so that everything
- * is in one transaction
- */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
- mark_inode_dirty(inode);
- error = journal_end(&th);
- reiserfs_write_unlock(inode->i_sb);
- if (error)
- goto out;
- }
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = inode_newsize_ok(inode, attr->ia_size);
- if (!error) {
- /*
- * Could race against reiserfs_file_release
- * if called from NFS, so take tailpack mutex.
- */
- mutex_lock(&REISERFS_I(inode)->tailpack);
- truncate_setsize(inode, attr->ia_size);
- reiserfs_truncate_file(inode, 1);
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- }
- }
-
- if (!error) {
- setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
- }
-
- if (!error && reiserfs_posixacl(inode->i_sb)) {
- if (attr->ia_valid & ATTR_MODE)
- error = reiserfs_acl_chmod(dentry);
- }
-
-out:
- return error;
-}
-
-const struct address_space_operations reiserfs_address_space_operations = {
- .writepages = reiserfs_writepages,
- .read_folio = reiserfs_read_folio,
- .readahead = reiserfs_readahead,
- .release_folio = reiserfs_release_folio,
- .invalidate_folio = reiserfs_invalidate_folio,
- .write_begin = reiserfs_write_begin,
- .write_end = reiserfs_write_end,
- .bmap = reiserfs_aop_bmap,
- .direct_IO = reiserfs_direct_IO,
- .dirty_folio = reiserfs_dirty_folio,
- .migrate_folio = buffer_migrate_folio,
-};
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
deleted file mode 100644
index dd33f8cc6eda..000000000000
--- a/fs/reiserfs/ioctl.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include "reiserfs.h"
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include <linux/pagemap.h>
-#include <linux/compat.h>
-#include <linux/fileattr.h>
-
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
-
- if (!reiserfs_attrs(inode->i_sb))
- return -ENOTTY;
-
- fileattr_fill_flags(fa, REISERFS_I(inode)->i_attrs);
-
- return 0;
-}
-
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- unsigned int flags = fa->flags;
- int err;
-
- reiserfs_write_lock(inode->i_sb);
-
- err = -ENOTTY;
- if (!reiserfs_attrs(inode->i_sb))
- goto unlock;
-
- err = -EOPNOTSUPP;
- if (fileattr_has_fsx(fa))
- goto unlock;
-
-	/*
-	 * Is it a quota file? Do not allow the user to mess with it
-	 */
- err = -EPERM;
- if (IS_NOQUOTA(inode))
- goto unlock;
-
- if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) {
- err = reiserfs_unpack(inode);
- if (err)
- goto unlock;
- }
- sd_attrs_to_i_attrs(flags, inode);
- REISERFS_I(inode)->i_attrs = flags;
- inode_set_ctime_current(inode);
- mark_inode_dirty(inode);
- err = 0;
-unlock:
- reiserfs_write_unlock(inode->i_sb);
-
- return err;
-}
-
-/*
- * reiserfs_ioctl - handler for ioctl for inode
- * supported commands:
- * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
- *			    and prevent packing of the file (the argument arg
- *			    has to be non-zero)
- * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
- * 3) That's all for a while ...
- */
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-{
- struct inode *inode = file_inode(filp);
- int err = 0;
-
- reiserfs_write_lock(inode->i_sb);
-
- switch (cmd) {
- case REISERFS_IOC_UNPACK:
- if (S_ISREG(inode->i_mode)) {
- if (arg)
- err = reiserfs_unpack(inode);
- } else
- err = -ENOTTY;
- break;
- /*
- * following two cases are taken from fs/ext2/ioctl.c by Remy
- * Card (card@masi.ibp.fr)
- */
- case REISERFS_IOC_GETVERSION:
- err = put_user(inode->i_generation, (int __user *)arg);
- break;
- case REISERFS_IOC_SETVERSION:
- if (!inode_owner_or_capable(&nop_mnt_idmap, inode)) {
- err = -EPERM;
- break;
- }
- err = mnt_want_write_file(filp);
- if (err)
- break;
- if (get_user(inode->i_generation, (int __user *)arg)) {
- err = -EFAULT;
- goto setversion_out;
- }
- inode_set_ctime_current(inode);
- mark_inode_dirty(inode);
-setversion_out:
- mnt_drop_write_file(filp);
- break;
- default:
- err = -ENOTTY;
- }
-
- reiserfs_write_unlock(inode->i_sb);
-
- return err;
-}
-
-#ifdef CONFIG_COMPAT
-long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
-{
-	/*
-	 * These are just misnamed: they actually
-	 * get/put an int from/to user space
-	 */
- switch (cmd) {
- case REISERFS_IOC32_UNPACK:
- cmd = REISERFS_IOC_UNPACK;
- break;
- case REISERFS_IOC32_GETVERSION:
- cmd = REISERFS_IOC_GETVERSION;
- break;
- case REISERFS_IOC32_SETVERSION:
- cmd = REISERFS_IOC_SETVERSION;
- break;
- default:
- return -ENOIOCTLCMD;
- }
-
- return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-}
-#endif
-
-int reiserfs_commit_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
-/*
- * reiserfs_unpack
- * Tries to convert the file tail from a direct item into an indirect one.
- * It sets the nopack flag (i_nopack_mask) in REISERFS_I(inode)->i_flags.
- */
-int reiserfs_unpack(struct inode *inode)
-{
- int retval = 0;
- int index;
- struct page *page;
- struct address_space *mapping;
- unsigned long write_from;
- unsigned long blocksize = inode->i_sb->s_blocksize;
-
- if (inode->i_size == 0) {
- REISERFS_I(inode)->i_flags |= i_nopack_mask;
- return 0;
- }
- /* ioctl already done */
- if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
- return 0;
- }
-
- /* we need to make sure nobody is changing the file size beneath us */
- {
- int depth = reiserfs_write_unlock_nested(inode->i_sb);
-
- inode_lock(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- }
-
- reiserfs_write_lock(inode->i_sb);
-
- write_from = inode->i_size & (blocksize - 1);
- /* if we are on a block boundary, we are already unpacked. */
- if (write_from == 0) {
- REISERFS_I(inode)->i_flags |= i_nopack_mask;
- goto out;
- }
-
- /*
- * we unpack by finding the page with the tail, and calling
- * __reiserfs_write_begin on that page. This will force a
- * reiserfs_get_block to unpack the tail for us.
- */
- index = inode->i_size >> PAGE_SHIFT;
- mapping = inode->i_mapping;
- page = grab_cache_page(mapping, index);
- retval = -ENOMEM;
- if (!page) {
- goto out;
- }
- retval = __reiserfs_write_begin(page, write_from, 0);
- if (retval)
- goto out_unlock;
-
- /* conversion can change page contents, must flush */
- flush_dcache_page(page);
- retval = reiserfs_commit_write(NULL, page, write_from, write_from);
- REISERFS_I(inode)->i_flags |= i_nopack_mask;
-
-out_unlock:
- unlock_page(page);
- put_page(page);
-
-out:
- inode_unlock(inode);
- reiserfs_write_unlock(inode->i_sb);
- return retval;
-}
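-
-/*
- * Usage sketch (annotation, not part of the original source): the
- * unpack path is reached from user space through REISERFS_IOC_UNPACK,
- * handled in reiserfs_ioctl() above; roughly:
- *
- *	int fd = open("/mnt/somefile", O_RDONLY);
- *	ioctl(fd, REISERFS_IOC_UNPACK, 1);
- *
- * the argument must be non-zero and the target must be a regular file,
- * otherwise the ioctl is a no-op or fails with -ENOTTY.
- */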
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
deleted file mode 100644
index 5011c10287c6..000000000000
--- a/fs/reiserfs/item_ops.c
+++ /dev/null
@@ -1,737 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include "reiserfs.h"
-
-/*
- * this contains item handlers for old item types: sd, direct,
- * indirect, directory
- */
-
-/*
- * and where are the comments? how about saying where we can find an
- * explanation of each item handler method? -Hans
- */
-
-/* stat data functions */
-static int sd_bytes_number(struct item_head *ih, int block_size)
-{
- return 0;
-}
-
-static void sd_decrement_key(struct cpu_key *key)
-{
- key->on_disk_key.k_objectid--;
- set_cpu_key_k_type(key, TYPE_ANY);
- set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1));
-}
-
-static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
-{
- return 0;
-}
-
-static void sd_print_item(struct item_head *ih, char *item)
-{
- printk("\tmode | size | nlinks | first direct | mtime\n");
- if (stat_data_v1(ih)) {
- struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
-
- printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd),
- sd_v1_size(sd), sd_v1_nlink(sd),
- sd_v1_first_direct_byte(sd),
- sd_v1_mtime(sd));
- } else {
- struct stat_data *sd = (struct stat_data *)item;
-
- printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd),
- (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
- sd_v2_rdev(sd), sd_v2_mtime(sd));
- }
-}
-
-static void sd_check_item(struct item_head *ih, char *item)
-{
- /* unused */
-}
-
-static int sd_create_vi(struct virtual_node *vn,
- struct virtual_item *vi,
- int is_affected, int insert_size)
-{
- vi->vi_index = TYPE_STAT_DATA;
- return 0;
-}
-
-static int sd_check_left(struct virtual_item *vi, int free,
- int start_skip, int end_skip)
-{
- BUG_ON(start_skip || end_skip);
- return -1;
-}
-
-static int sd_check_right(struct virtual_item *vi, int free)
-{
- return -1;
-}
-
-static int sd_part_size(struct virtual_item *vi, int first, int count)
-{
- BUG_ON(count);
- return 0;
-}
-
-static int sd_unit_num(struct virtual_item *vi)
-{
- return vi->vi_item_len - IH_SIZE;
-}
-
-static void sd_print_vi(struct virtual_item *vi)
-{
- reiserfs_warning(NULL, "reiserfs-16100",
- "STATDATA, index %d, type 0x%x, %h",
- vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations stat_data_ops = {
- .bytes_number = sd_bytes_number,
- .decrement_key = sd_decrement_key,
- .is_left_mergeable = sd_is_left_mergeable,
- .print_item = sd_print_item,
- .check_item = sd_check_item,
-
- .create_vi = sd_create_vi,
- .check_left = sd_check_left,
- .check_right = sd_check_right,
- .part_size = sd_part_size,
- .unit_num = sd_unit_num,
- .print_vi = sd_print_vi
-};
-
-/* direct item functions */
-static int direct_bytes_number(struct item_head *ih, int block_size)
-{
- return ih_item_len(ih);
-}
-
-/* FIXME: this should probably switch to indirect as well */
-static void direct_decrement_key(struct cpu_key *key)
-{
- cpu_key_k_offset_dec(key);
- if (cpu_key_k_offset(key) == 0)
- set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direct_is_left_mergeable(struct reiserfs_key *key,
- unsigned long bsize)
-{
- int version = le_key_version(key);
- return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
-}
-
-static void direct_print_item(struct item_head *ih, char *item)
-{
- int j = 0;
-
-/* return; */
- printk("\"");
- while (j < ih_item_len(ih))
- printk("%c", item[j++]);
- printk("\"\n");
-}
-
-static void direct_check_item(struct item_head *ih, char *item)
-{
- /* unused */
-}
-
-static int direct_create_vi(struct virtual_node *vn,
- struct virtual_item *vi,
- int is_affected, int insert_size)
-{
- vi->vi_index = TYPE_DIRECT;
- return 0;
-}
-
-static int direct_check_left(struct virtual_item *vi, int free,
- int start_skip, int end_skip)
-{
- int bytes;
-
- bytes = free - free % 8;
- return bytes ? : -1;
-}
-
-static int direct_check_right(struct virtual_item *vi, int free)
-{
- return direct_check_left(vi, free, 0, 0);
-}
-
-static int direct_part_size(struct virtual_item *vi, int first, int count)
-{
- return count;
-}
-
-static int direct_unit_num(struct virtual_item *vi)
-{
- return vi->vi_item_len - IH_SIZE;
-}
-
-static void direct_print_vi(struct virtual_item *vi)
-{
- reiserfs_warning(NULL, "reiserfs-16101",
- "DIRECT, index %d, type 0x%x, %h",
- vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations direct_ops = {
- .bytes_number = direct_bytes_number,
- .decrement_key = direct_decrement_key,
- .is_left_mergeable = direct_is_left_mergeable,
- .print_item = direct_print_item,
- .check_item = direct_check_item,
-
- .create_vi = direct_create_vi,
- .check_left = direct_check_left,
- .check_right = direct_check_right,
- .part_size = direct_part_size,
- .unit_num = direct_unit_num,
- .print_vi = direct_print_vi
-};
-
-/* indirect item functions */
-static int indirect_bytes_number(struct item_head *ih, int block_size)
-{
- return ih_item_len(ih) / UNFM_P_SIZE * block_size;
-}
-
-/* decrease offset, if it becomes 0, change type to stat data */
-static void indirect_decrement_key(struct cpu_key *key)
-{
- cpu_key_k_offset_dec(key);
- if (cpu_key_k_offset(key) == 0)
- set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-/* if it is not first item of the body, then it is mergeable */
-static int indirect_is_left_mergeable(struct reiserfs_key *key,
- unsigned long bsize)
-{
- int version = le_key_version(key);
- return (le_key_k_offset(version, key) != 1);
-}
-
-/* printing of indirect item */
-static void start_new_sequence(__u32 * start, int *len, __u32 new)
-{
- *start = new;
- *len = 1;
-}
-
-static int sequence_finished(__u32 start, int *len, __u32 new)
-{
- if (start == INT_MAX)
- return 1;
-
- if (start == 0 && new == 0) {
- (*len)++;
- return 0;
- }
- if (start != 0 && (start + *len) == new) {
- (*len)++;
- return 0;
- }
- return 1;
-}
-
-static void print_sequence(__u32 start, int len)
-{
- if (start == INT_MAX)
- return;
-
- if (len == 1)
- printk(" %d", start);
- else
- printk(" %d(%d)", start, len);
-}
-
-static void indirect_print_item(struct item_head *ih, char *item)
-{
- int j;
- __le32 *unp;
- __u32 prev = INT_MAX;
- int num = 0;
-
- unp = (__le32 *) item;
-
- if (ih_item_len(ih) % UNFM_P_SIZE)
- reiserfs_warning(NULL, "reiserfs-16102", "invalid item len");
-
- printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
- for (j = 0; j < I_UNFM_NUM(ih); j++) {
- if (sequence_finished(prev, &num, get_block_num(unp, j))) {
- print_sequence(prev, num);
- start_new_sequence(&prev, &num, get_block_num(unp, j));
- }
- }
- print_sequence(prev, num);
- printk("]\n");
-}
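-
-/*
- * Worked example (annotation, not in the original source): for an
- * indirect item with unformatted node pointers { 100, 101, 102, 0, 0,
- * 200 }, indirect_print_item() above emits
- *
- *	6 pointers
- *	[  100(3) 0(2) 200]
- *
- * consecutive block numbers, and runs of zeroes (holes), are collapsed
- * into start(len) pairs by sequence_finished().
- */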
-
-static void indirect_check_item(struct item_head *ih, char *item)
-{
- /* unused */
-}
-
-static int indirect_create_vi(struct virtual_node *vn,
- struct virtual_item *vi,
- int is_affected, int insert_size)
-{
- vi->vi_index = TYPE_INDIRECT;
- return 0;
-}
-
-static int indirect_check_left(struct virtual_item *vi, int free,
- int start_skip, int end_skip)
-{
- int bytes;
-
- bytes = free - free % UNFM_P_SIZE;
- return bytes ? : -1;
-}
-
-static int indirect_check_right(struct virtual_item *vi, int free)
-{
- return indirect_check_left(vi, free, 0, 0);
-}
-
-/*
- * return size in bytes of 'units' units. If first == 0 - calculate
- * from the head (left), otherwise - from tail (right)
- */
-static int indirect_part_size(struct virtual_item *vi, int first, int units)
-{
- /* unit of indirect item is byte (yet) */
- return units;
-}
-
-static int indirect_unit_num(struct virtual_item *vi)
-{
- /* unit of indirect item is byte (yet) */
- return vi->vi_item_len - IH_SIZE;
-}
-
-static void indirect_print_vi(struct virtual_item *vi)
-{
- reiserfs_warning(NULL, "reiserfs-16103",
- "INDIRECT, index %d, type 0x%x, %h",
- vi->vi_index, vi->vi_type, vi->vi_ih);
-}
-
-static struct item_operations indirect_ops = {
- .bytes_number = indirect_bytes_number,
- .decrement_key = indirect_decrement_key,
- .is_left_mergeable = indirect_is_left_mergeable,
- .print_item = indirect_print_item,
- .check_item = indirect_check_item,
-
- .create_vi = indirect_create_vi,
- .check_left = indirect_check_left,
- .check_right = indirect_check_right,
- .part_size = indirect_part_size,
- .unit_num = indirect_unit_num,
- .print_vi = indirect_print_vi
-};
-
-/* direntry functions */
-static int direntry_bytes_number(struct item_head *ih, int block_size)
-{
- reiserfs_warning(NULL, "vs-16090",
- "bytes number is asked for direntry");
- return 0;
-}
-
-static void direntry_decrement_key(struct cpu_key *key)
-{
- cpu_key_k_offset_dec(key);
- if (cpu_key_k_offset(key) == 0)
- set_cpu_key_k_type(key, TYPE_STAT_DATA);
-}
-
-static int direntry_is_left_mergeable(struct reiserfs_key *key,
- unsigned long bsize)
-{
- if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
- return 0;
- return 1;
-
-}
-
-static void direntry_print_item(struct item_head *ih, char *item)
-{
- int i;
- int namelen;
- struct reiserfs_de_head *deh;
- char *name;
- static char namebuf[80];
-
- printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
- "Key of pointed object", "Hash", "Gen number", "Status");
-
- deh = (struct reiserfs_de_head *)item;
-
- for (i = 0; i < ih_entry_count(ih); i++, deh++) {
- namelen =
- (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
- deh_location(deh);
- name = item + deh_location(deh);
- if (name[namelen - 1] == 0)
- namelen = strlen(name);
-
- scnprintf(namebuf, sizeof(namebuf), "\"%.*s\"",
- (int)sizeof(namebuf)-3, name);
-
- printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
- i, namebuf,
- deh_dir_id(deh), deh_objectid(deh),
- GET_HASH_VALUE(deh_offset(deh)),
- GET_GENERATION_NUMBER((deh_offset(deh))),
- (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
- }
-}
-
-static void direntry_check_item(struct item_head *ih, char *item)
-{
- int i;
- struct reiserfs_de_head *deh;
-
- /* unused */
- deh = (struct reiserfs_de_head *)item;
- for (i = 0; i < ih_entry_count(ih); i++, deh++) {
- ;
- }
-}
-
-#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
-
-/*
- * function returns old entry number in directory item in real node
- * using new entry number in virtual item in virtual node
- */
-static inline int old_entry_num(int is_affected, int virtual_entry_num,
- int pos_in_item, int mode)
-{
- if (mode == M_INSERT || mode == M_DELETE)
- return virtual_entry_num;
-
- if (!is_affected)
- /* cut or paste is applied to another item */
- return virtual_entry_num;
-
- if (virtual_entry_num < pos_in_item)
- return virtual_entry_num;
-
- if (mode == M_CUT)
- return virtual_entry_num + 1;
-
- RFALSE(mode != M_PASTE || virtual_entry_num == 0,
- "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
- mode);
-
- return virtual_entry_num - 1;
-}
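-/*
- * Worked example (illustrative): take an affected item and
- * pos_in_item == 2. For M_PASTE, virtual entries 0 and 1 map to old
- * entries 0 and 1, virtual entry 2 is the pasted entry (whatever this
- * returns for it is overwritten by the caller), and virtual entries
- * 3..N map to old entries 2..N-1. For M_CUT, virtual entries >= 2 map
- * to old entries 3 and up, skipping the cut entry (old entry 2).
- */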
-
-/*
- * Create an array of directory entry sizes for the virtual item.
- * Return the space used by the item. FIXME: no control over
- * consumption of the space used by this item handler.
- */
-static int direntry_create_vi(struct virtual_node *vn,
- struct virtual_item *vi,
- int is_affected, int insert_size)
-{
- struct direntry_uarea *dir_u = vi->vi_uarea;
- int i, j;
- int size = sizeof(struct direntry_uarea);
- struct reiserfs_de_head *deh;
-
- vi->vi_index = TYPE_DIRENTRY;
-
- BUG_ON(!(vi->vi_ih) || !vi->vi_item);
-
- dir_u->flags = 0;
- if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
- dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
-
- deh = (struct reiserfs_de_head *)(vi->vi_item);
-
- /* the virtual directory item will have this many entries afterwards */
- dir_u->entry_count = ih_entry_count(vi->vi_ih) +
- ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
- (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
-
- for (i = 0; i < dir_u->entry_count; i++) {
- j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
- vn->vn_mode);
- dir_u->entry_sizes[i] =
- (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
- deh_location(&deh[j]) + DEH_SIZE;
- }
-
- size += (dir_u->entry_count * sizeof(short));
-
- /* set size of pasted entry */
- if (is_affected && vn->vn_mode == M_PASTE)
- dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
-
-#ifdef CONFIG_REISERFS_CHECK
- /* compare total size of entries with item length */
- {
- int k, l;
-
- l = 0;
- for (k = 0; k < dir_u->entry_count; k++)
- l += dir_u->entry_sizes[k];
-
- if (l + IH_SIZE != vi->vi_item_len +
- ((is_affected
- && (vn->vn_mode == M_PASTE
- || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
- reiserfs_panic(NULL, "vs-8025", "(mode==%c, "
- "insert_size==%d), invalid length of "
- "directory item",
- vn->vn_mode, insert_size);
- }
- }
-#endif
-
- return size;
-
-}
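-/*
- * The size computation above mirrors the on-disk layout: entry bodies
- * are packed from the end of the item back toward the header array, so
- * entry j's body spans [deh_location(j), deh_location(j - 1)) -- or up
- * to the item end for j == 0 -- plus DEH_SIZE for its header.
- * Illustrative numbers only: with ih_item_len == 72 and locations
- * 60, 48, 32 the body sizes are 12, 12 and 16 bytes, each plus DEH_SIZE.
- */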
-
-/*
- * return number of entries which may fit into specified amount of
- * free space, or -1 if free space is not enough even for 1 entry
- */
-static int direntry_check_left(struct virtual_item *vi, int free,
- int start_skip, int end_skip)
-{
- int i;
- int entries = 0;
- struct direntry_uarea *dir_u = vi->vi_uarea;
-
- for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
- /* i-th entry doesn't fit into the remaining free space */
- if (dir_u->entry_sizes[i] > free)
- break;
-
- free -= dir_u->entry_sizes[i];
- entries++;
- }
-
- if (entries == dir_u->entry_count) {
- reiserfs_panic(NULL, "item_ops-1",
- "free space %d, entry_count %d", free,
- dir_u->entry_count);
- }
-
- /* "." and ".." cannot be separated from each other */
- if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
- && entries < 2)
- entries = 0;
-
- return entries ? : -1;
-}
-
-static int direntry_check_right(struct virtual_item *vi, int free)
-{
- int i;
- int entries = 0;
- struct direntry_uarea *dir_u = vi->vi_uarea;
-
- for (i = dir_u->entry_count - 1; i >= 0; i--) {
- /* i-th entry doesn't fit into the remaining free space */
- if (dir_u->entry_sizes[i] > free)
- break;
-
- free -= dir_u->entry_sizes[i];
- entries++;
- }
- BUG_ON(entries == dir_u->entry_count);
-
- /* "." and ".." cannot be separated from each other */
- if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
- && entries > dir_u->entry_count - 2)
- entries = dir_u->entry_count - 2;
-
- return entries ? : -1;
-}
-
-/* sum of entry sizes between from-th and to-th entries including both edges */
-static int direntry_part_size(struct virtual_item *vi, int first, int count)
-{
- int i, retval;
- int from, to;
- struct direntry_uarea *dir_u = vi->vi_uarea;
-
- retval = 0;
- if (first == 0)
- from = 0;
- else
- from = dir_u->entry_count - count;
- to = from + count - 1;
-
- for (i = from; i <= to; i++)
- retval += dir_u->entry_sizes[i];
-
- return retval;
-}
-
-static int direntry_unit_num(struct virtual_item *vi)
-{
- struct direntry_uarea *dir_u = vi->vi_uarea;
-
- return dir_u->entry_count;
-}
-
-static void direntry_print_vi(struct virtual_item *vi)
-{
- int i;
- struct direntry_uarea *dir_u = vi->vi_uarea;
-
- reiserfs_warning(NULL, "reiserfs-16104",
- "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
- vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
- printk("%d entries: ", dir_u->entry_count);
- for (i = 0; i < dir_u->entry_count; i++)
- printk("%d ", dir_u->entry_sizes[i]);
- printk("\n");
-}
-
-static struct item_operations direntry_ops = {
- .bytes_number = direntry_bytes_number,
- .decrement_key = direntry_decrement_key,
- .is_left_mergeable = direntry_is_left_mergeable,
- .print_item = direntry_print_item,
- .check_item = direntry_check_item,
-
- .create_vi = direntry_create_vi,
- .check_left = direntry_check_left,
- .check_right = direntry_check_right,
- .part_size = direntry_part_size,
- .unit_num = direntry_unit_num,
- .print_vi = direntry_print_vi
-};
-
-/* Error catching functions to catch errors caused by incorrect item types. */
-static int errcatch_bytes_number(struct item_head *ih, int block_size)
-{
- reiserfs_warning(NULL, "green-16001",
- "Invalid item type observed, run fsck ASAP");
- return 0;
-}
-
-static void errcatch_decrement_key(struct cpu_key *key)
-{
- reiserfs_warning(NULL, "green-16002",
- "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_is_left_mergeable(struct reiserfs_key *key,
- unsigned long bsize)
-{
- reiserfs_warning(NULL, "green-16003",
- "Invalid item type observed, run fsck ASAP");
- return 0;
-}
-
-static void errcatch_print_item(struct item_head *ih, char *item)
-{
- reiserfs_warning(NULL, "green-16004",
- "Invalid item type observed, run fsck ASAP");
-}
-
-static void errcatch_check_item(struct item_head *ih, char *item)
-{
- reiserfs_warning(NULL, "green-16005",
- "Invalid item type observed, run fsck ASAP");
-}
-
-static int errcatch_create_vi(struct virtual_node *vn,
- struct virtual_item *vi,
- int is_affected, int insert_size)
-{
- reiserfs_warning(NULL, "green-16006",
- "Invalid item type observed, run fsck ASAP");
- /*
- * We might return -1 here as well, but it wouldn't help:
- * create_virtual_node(), from which this operation is
- * called, returns void.
- */
- return 0;
-}
-
-static int errcatch_check_left(struct virtual_item *vi, int free,
- int start_skip, int end_skip)
-{
- reiserfs_warning(NULL, "green-16007",
- "Invalid item type observed, run fsck ASAP");
- return -1;
-}
-
-static int errcatch_check_right(struct virtual_item *vi, int free)
-{
- reiserfs_warning(NULL, "green-16008",
- "Invalid item type observed, run fsck ASAP");
- return -1;
-}
-
-static int errcatch_part_size(struct virtual_item *vi, int first, int count)
-{
- reiserfs_warning(NULL, "green-16009",
- "Invalid item type observed, run fsck ASAP");
- return 0;
-}
-
-static int errcatch_unit_num(struct virtual_item *vi)
-{
- reiserfs_warning(NULL, "green-16010",
- "Invalid item type observed, run fsck ASAP");
- return 0;
-}
-
-static void errcatch_print_vi(struct virtual_item *vi)
-{
- reiserfs_warning(NULL, "green-16011",
- "Invalid item type observed, run fsck ASAP");
-}
-
-static struct item_operations errcatch_ops = {
- .bytes_number = errcatch_bytes_number,
- .decrement_key = errcatch_decrement_key,
- .is_left_mergeable = errcatch_is_left_mergeable,
- .print_item = errcatch_print_item,
- .check_item = errcatch_check_item,
-
- .create_vi = errcatch_create_vi,
- .check_left = errcatch_check_left,
- .check_right = errcatch_check_right,
- .part_size = errcatch_part_size,
- .unit_num = errcatch_unit_num,
- .print_vi = errcatch_print_vi
-};
-
-#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
-#error Item types must use disk-format assigned values.
-#endif
-
-struct item_operations *item_ops[TYPE_ANY + 1] = {
- &stat_data_ops,
- &indirect_ops,
- &direct_ops,
- &direntry_ops,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- &errcatch_ops /* catches errors with invalid item type (index 15, TYPE_ANY) */
-};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
deleted file mode 100644
index e477ee0ff35d..000000000000
--- a/fs/reiserfs/journal.c
+++ /dev/null
@@ -1,4404 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Write ahead logging implementation copyright Chris Mason 2000
- *
- * The background commits make this code very interrelated, and
- * overly complex. I need to rethink things a bit... The major players:
- *
- * journal_begin -- call with the number of blocks you expect to log.
- * If the current transaction is too
- * old, it will block until the current transaction is
- * finished, and then start a new one.
- * Usually, your transaction will get joined in with
- * previous ones for speed.
- *
- * journal_join -- same as journal_begin, but won't block on the current
- * transaction regardless of age. Don't ever call
- * this. Ever. There are only two places it should be
- * called from, and they are both inside this file.
- *
- * journal_mark_dirty -- adds blocks into this transaction. Clears any flags
- * that might make them get sent to disk
- * and then marks them BH_JDirty. Puts the buffer head
- * into the current transaction hash.
- *
- * journal_end -- if the current transaction is batchable, it does nothing;
- * otherwise, it could do an async/synchronous commit, or
- * a full flush of all log and real blocks in the
- * transaction.
- *
- * flush_old_commits -- if the current transaction is too old, it is ended and
- * commit blocks are sent to disk. Forces commit blocks
- * to disk for all backgrounded commits that have been
- * around too long.
- * -- Note, if you call this as an immediate flush from
- * within kupdate, it will ignore the immediate flag
- */
-
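-/*
- * Illustrative call pattern for the API described above (a hedged
- * sketch, not code from this file; "jbegin_count" stands for a
- * caller-chosen estimate of blocks to log, and error handling is
- * elided):
- *
- *   struct reiserfs_transaction_handle th;
- *   int err = journal_begin(&th, sb, jbegin_count);
- *   if (!err) {
- *           journal_mark_dirty(&th, bh);  -- log this buffer
- *           err = journal_end(&th);       -- batch or commit
- *   }
- */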
-#include <linux/time.h>
-#include <linux/semaphore.h>
-#include <linux/vmalloc.h>
-#include "reiserfs.h"
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/workqueue.h>
-#include <linux/writeback.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-
-
-/* gets a struct reiserfs_journal_list * from a list head */
-#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
- j_list))
-
-/* must be correct to keep the desc and commit structs at 4k */
-#define JOURNAL_TRANS_HALF 1018
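-/*
- * (Arithmetic sketch, assuming a 12-byte block header plus a 12-byte
- * trailing magic in the desc/commit blocks: (4096 - 24) / 4 leaves 1018
- * four-byte block-number slots per 4k block, hence the value above.)
- */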
-#define BUFNR 64 /*read ahead */
-
-/* cnode stat bits. Move these into reiserfs_fs.h */
-
-/* this block was freed, and can't be written. */
-#define BLOCK_FREED 2
-/* this block was freed during this transaction, and can't be written */
-#define BLOCK_FREED_HOLDER 3
-
-/* used in flush_journal_list */
-#define BLOCK_NEEDS_FLUSH 4
-#define BLOCK_DIRTIED 5
-
-/* journal list state bits */
-#define LIST_TOUCHED 1
-#define LIST_DIRTY 2
-#define LIST_COMMIT_PENDING 4 /* someone will commit this list */
-
-/* flags for do_journal_end */
-#define FLUSH_ALL 1 /* flush commit and real blocks */
-#define COMMIT_NOW 2 /* end and commit this transaction */
-#define WAIT 4 /* wait for the log blocks to hit the disk */
-
-static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
-static int flush_journal_list(struct super_block *s,
- struct reiserfs_journal_list *jl, int flushall);
-static int flush_commit_list(struct super_block *s,
- struct reiserfs_journal_list *jl, int flushall);
-static int can_dirty(struct reiserfs_journal_cnode *cn);
-static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb);
-static void release_journal_dev(struct reiserfs_journal *journal);
-static void dirty_one_transaction(struct super_block *s,
- struct reiserfs_journal_list *jl);
-static void flush_async_commits(struct work_struct *work);
-static void queue_log_writer(struct super_block *s);
-
-/* values for join in do_journal_begin_r */
-enum {
- JBEGIN_REG = 0, /* regular journal begin */
- /* join the running transaction if at all possible */
- JBEGIN_JOIN = 1,
- /* called from cleanup code, ignores aborted flag */
- JBEGIN_ABORT = 2,
-};
-
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
- struct super_block *sb,
- unsigned long nblocks, int join);
-
-static void init_journal_hash(struct super_block *sb)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- memset(journal->j_hash_table, 0,
- JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-}
-
-/*
- * clears BH_Dirty and sticks the buffer on the clean list. Called because
- * I can't allow refile_buffer to make schedule happen after I've freed a
- * block. Look at remove_from_transaction and journal_mark_freed for
- * more details.
- */
-static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
-{
- if (bh) {
- clear_buffer_dirty(bh);
- clear_buffer_journal_test(bh);
- }
- return 0;
-}
-
-static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block *sb)
-{
- struct reiserfs_bitmap_node *bn;
- static int id;
-
- bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS);
- if (!bn) {
- return NULL;
- }
- bn->data = kzalloc(sb->s_blocksize, GFP_NOFS);
- if (!bn->data) {
- kfree(bn);
- return NULL;
- }
- bn->id = id++;
- INIT_LIST_HEAD(&bn->list);
- return bn;
-}
-
-static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_bitmap_node *bn = NULL;
- struct list_head *entry = journal->j_bitmap_nodes.next;
-
- journal->j_used_bitmap_nodes++;
-repeat:
-
- if (entry != &journal->j_bitmap_nodes) {
- bn = list_entry(entry, struct reiserfs_bitmap_node, list);
- list_del(entry);
- memset(bn->data, 0, sb->s_blocksize);
- journal->j_free_bitmap_nodes--;
- return bn;
- }
- bn = allocate_bitmap_node(sb);
- if (!bn) {
- yield();
- goto repeat;
- }
- return bn;
-}
-static inline void free_bitmap_node(struct super_block *sb,
- struct reiserfs_bitmap_node *bn)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- journal->j_used_bitmap_nodes--;
- if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
- kfree(bn->data);
- kfree(bn);
- } else {
- list_add(&bn->list, &journal->j_bitmap_nodes);
- journal->j_free_bitmap_nodes++;
- }
-}
-
-static void allocate_bitmap_nodes(struct super_block *sb)
-{
- int i;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_bitmap_node *bn = NULL;
- for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
- bn = allocate_bitmap_node(sb);
- if (bn) {
- list_add(&bn->list, &journal->j_bitmap_nodes);
- journal->j_free_bitmap_nodes++;
- } else {
- /* this is ok, we'll try again when more are needed */
- break;
- }
- }
-}
-
-static int set_bit_in_list_bitmap(struct super_block *sb,
- b_blocknr_t block,
- struct reiserfs_list_bitmap *jb)
-{
- unsigned int bmap_nr = block / (sb->s_blocksize << 3);
- unsigned int bit_nr = block % (sb->s_blocksize << 3);
-
- if (!jb->bitmaps[bmap_nr]) {
- jb->bitmaps[bmap_nr] = get_bitmap_node(sb);
- }
- set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
- return 0;
-}
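-/*
- * Worked example (illustrative): with a 4k block size each bitmap node
- * covers 4096 << 3 == 32768 bits, so block 70000 maps to bmap_nr 2 and
- * bit_nr 70000 - 2 * 32768 == 4464.
- */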
-
-static void cleanup_bitmap_list(struct super_block *sb,
- struct reiserfs_list_bitmap *jb)
-{
- int i;
- if (jb->bitmaps == NULL)
- return;
-
- for (i = 0; i < reiserfs_bmap_count(sb); i++) {
- if (jb->bitmaps[i]) {
- free_bitmap_node(sb, jb->bitmaps[i]);
- jb->bitmaps[i] = NULL;
- }
- }
-}
-
-/*
- * only call this on FS unmount.
- */
-static int free_list_bitmaps(struct super_block *sb,
- struct reiserfs_list_bitmap *jb_array)
-{
- int i;
- struct reiserfs_list_bitmap *jb;
- for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
- jb = jb_array + i;
- jb->journal_list = NULL;
- cleanup_bitmap_list(sb, jb);
- vfree(jb->bitmaps);
- jb->bitmaps = NULL;
- }
- return 0;
-}
-
-static int free_bitmap_nodes(struct super_block *sb)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct list_head *next = journal->j_bitmap_nodes.next;
- struct reiserfs_bitmap_node *bn;
-
- while (next != &journal->j_bitmap_nodes) {
- bn = list_entry(next, struct reiserfs_bitmap_node, list);
- list_del(next);
- kfree(bn->data);
- kfree(bn);
- next = journal->j_bitmap_nodes.next;
- journal->j_free_bitmap_nodes--;
- }
-
- return 0;
-}
-
-/*
- * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
- * jb_array is the array to be filled in.
- */
-int reiserfs_allocate_list_bitmaps(struct super_block *sb,
- struct reiserfs_list_bitmap *jb_array,
- unsigned int bmap_nr)
-{
- int i;
- int failed = 0;
- struct reiserfs_list_bitmap *jb;
- int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
-
- for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
- jb = jb_array + i;
- jb->journal_list = NULL;
- jb->bitmaps = vzalloc(mem);
- if (!jb->bitmaps) {
- reiserfs_warning(sb, "clm-2000", "unable to "
- "allocate bitmaps for journal lists");
- failed = 1;
- break;
- }
- }
- if (failed) {
- free_list_bitmaps(sb, jb_array);
- return -1;
- }
- return 0;
-}
-
-/*
- * find an available list bitmap. If you can't find one, flush a commit list
- * and try again
- */
-static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
- struct reiserfs_journal_list *jl)
-{
- int i, j;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_list_bitmap *jb = NULL;
-
- for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
- i = journal->j_list_bitmap_index;
- journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
- jb = journal->j_list_bitmap + i;
- if (journal->j_list_bitmap[i].journal_list) {
- flush_commit_list(sb,
- journal->j_list_bitmap[i].
- journal_list, 1);
- if (!journal->j_list_bitmap[i].journal_list) {
- break;
- }
- } else {
- break;
- }
- }
- /* double check to make sure it was flushed correctly */
- if (jb->journal_list)
- return NULL;
- jb->journal_list = jl;
- return jb;
-}
-
-/*
- * allocates a new chunk of X nodes, and links them all together as a list.
- * Uses the cnode->next and cnode->prev pointers
- * returns NULL on failure
- */
-static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
-{
- struct reiserfs_journal_cnode *head;
- int i;
- if (num_cnodes <= 0) {
- return NULL;
- }
- head = vzalloc(array_size(num_cnodes,
- sizeof(struct reiserfs_journal_cnode)));
- if (!head) {
- return NULL;
- }
- head[0].prev = NULL;
- head[0].next = head + 1;
- for (i = 1; i < num_cnodes; i++) {
- head[i].prev = head + (i - 1);
- head[i].next = head + (i + 1); /* the last one is overwritten after the loop */
- }
- head[num_cnodes - 1].next = NULL;
- return head;
-}
-
-/* pulls a cnode off the free list, or returns NULL on failure */
-static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
-{
- struct reiserfs_journal_cnode *cn;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- reiserfs_check_lock_depth(sb, "get_cnode");
-
- if (journal->j_cnode_free <= 0) {
- return NULL;
- }
- journal->j_cnode_used++;
- journal->j_cnode_free--;
- cn = journal->j_cnode_free_list;
- if (!cn) {
- return cn;
- }
- if (cn->next) {
- cn->next->prev = NULL;
- }
- journal->j_cnode_free_list = cn->next;
- memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
- return cn;
-}
-
-/*
- * returns a cnode to the free list
- */
-static void free_cnode(struct super_block *sb,
- struct reiserfs_journal_cnode *cn)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- reiserfs_check_lock_depth(sb, "free_cnode");
-
- journal->j_cnode_used--;
- journal->j_cnode_free++;
- /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
- cn->next = journal->j_cnode_free_list;
- if (journal->j_cnode_free_list) {
- journal->j_cnode_free_list->prev = cn;
- }
- cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to do this */
- journal->j_cnode_free_list = cn;
-}
-
-static void clear_prepared_bits(struct buffer_head *bh)
-{
- clear_buffer_journal_prepared(bh);
- clear_buffer_journal_restore_dirty(bh);
-}
-
-/*
- * return a cnode with the same device and block number from the table,
- * or NULL if not found
- */
-static inline struct reiserfs_journal_cnode *
-get_journal_hash_dev(struct super_block *sb,
- struct reiserfs_journal_cnode **table, long bl)
-{
- struct reiserfs_journal_cnode *cn;
- cn = journal_hash(table, sb, bl);
- while (cn) {
- if (cn->blocknr == bl && cn->sb == sb)
- return cn;
- cn = cn->hnext;
- }
- return NULL;
-}
-
-/*
- * this actually means 'can this block be reallocated yet?'. If you set
- * search_all, a block can only be allocated if it is not in the current
- * transaction, was not freed by the current transaction, and has no chance
- * of ever being overwritten by a replay after crashing.
- *
- * If you don't set search_all, a block can only be allocated if it is not
- * in the current transaction. Since deleting a block removes it from the
- * current transaction, this case should never happen. If you don't set
- * search_all, make sure you never write the block without logging it.
- *
- * next_zero_bit is a suggestion about the next block to try for find_forward.
- * When bl is rejected because it is set in a journal list bitmap, we search
- * for the next zero bit in the bitmap that rejected bl. Then, we return
- * that through next_zero_bit for find_forward to try.
- *
- * Just because we return something in next_zero_bit does not mean we won't
- * reject it on the next call to reiserfs_in_journal
- */
-int reiserfs_in_journal(struct super_block *sb,
- unsigned int bmap_nr, int bit_nr, int search_all,
- b_blocknr_t *next_zero_bit)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_list_bitmap *jb;
- int i;
- unsigned long bl;
-
- *next_zero_bit = 0; /* always start this at zero. */
-
- PROC_INFO_INC(sb, journal.in_journal);
- /*
- * If we aren't doing a search_all, this is a metablock, and it
- * will be logged before use. If we crash before the transaction
- * that freed it commits, this transaction won't have committed
- * either, and the block will never be written
- */
- if (search_all) {
- for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
- PROC_INFO_INC(sb, journal.in_journal_bitmap);
- jb = journal->j_list_bitmap + i;
- if (jb->journal_list && jb->bitmaps[bmap_nr] &&
- test_bit(bit_nr,
- (unsigned long *)jb->bitmaps[bmap_nr]->
- data)) {
- *next_zero_bit =
- find_next_zero_bit((unsigned long *)
- (jb->bitmaps[bmap_nr]->
- data),
- sb->s_blocksize << 3,
- bit_nr + 1);
- return 1;
- }
- }
- }
-
- bl = bmap_nr * (sb->s_blocksize << 3) + bit_nr;
- /* is it in any old transactions? */
- if (search_all
- && (get_journal_hash_dev(sb, journal->j_list_hash_table, bl))) {
- return 1;
- }
-
- /* is it in the current transaction? This should never happen */
- if ((get_journal_hash_dev(sb, journal->j_hash_table, bl))) {
- BUG();
- return 1;
- }
-
- PROC_INFO_INC(sb, journal.in_journal_reusable);
- /* safe for reuse */
- return 0;
-}
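-/*
- * Caller-side sketch (hedged; the real user is the block allocator in
- * bitmap.c): an allocator scanning for a free block passes
- * search_all == 1 and can use next_zero_bit as a skip-ahead hint, e.g.:
- *
- *   b_blocknr_t next;
- *   if (reiserfs_in_journal(sb, bmap_nr, bit_nr, 1, &next))
- *           bit_nr = next;  -- rejected, retry from the hint
- */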
-
-/* insert cn into table */
-static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
- struct reiserfs_journal_cnode *cn)
-{
- struct reiserfs_journal_cnode *cn_orig;
-
- cn_orig = journal_hash(table, cn->sb, cn->blocknr);
- cn->hnext = cn_orig;
- cn->hprev = NULL;
- if (cn_orig) {
- cn_orig->hprev = cn;
- }
- journal_hash(table, cn->sb, cn->blocknr) = cn;
-}
-
-/* lock the current transaction */
-static inline void lock_journal(struct super_block *sb)
-{
- PROC_INFO_INC(sb, journal.lock_journal);
-
- reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
-}
-
-/* unlock the current transaction */
-static inline void unlock_journal(struct super_block *sb)
-{
- mutex_unlock(&SB_JOURNAL(sb)->j_mutex);
-}
-
-static inline void get_journal_list(struct reiserfs_journal_list *jl)
-{
- jl->j_refcount++;
-}
-
-static inline void put_journal_list(struct super_block *s,
- struct reiserfs_journal_list *jl)
-{
- if (jl->j_refcount < 1) {
- reiserfs_panic(s, "journal-2", "trans id %u, refcount at %d",
- jl->j_trans_id, jl->j_refcount);
- }
- if (--jl->j_refcount == 0)
- kfree(jl);
-}
-
-/*
- * this used to be much more involved, and I'm keeping it just in case
- * things get ugly again. It gets called by flush_commit_list, and
- * cleans up any data stored about blocks freed during a transaction.
- */
-static void cleanup_freed_for_journal_list(struct super_block *sb,
- struct reiserfs_journal_list *jl)
-{
-
- struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
- if (jb) {
- cleanup_bitmap_list(sb, jb);
- }
- jl->j_list_bitmap->journal_list = NULL;
- jl->j_list_bitmap = NULL;
-}
-
-static int journal_list_still_alive(struct super_block *s,
- unsigned int trans_id)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- struct list_head *entry = &journal->j_journal_list;
- struct reiserfs_journal_list *jl;
-
- if (!list_empty(entry)) {
- jl = JOURNAL_LIST_ENTRY(entry->next);
- if (jl->j_trans_id <= trans_id) {
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * If page->mapping was null, we failed to truncate this page for
- * some reason. Most likely because it was truncated after being
- * logged via data=journal.
- *
- * This does a check to see if the buffer belongs to one of these
- * lost pages before doing the final put_bh. If page->mapping was
- * null, it tries to free buffers on the page, which should make the
- * final put_page drop the page from the lru.
- */
-static void release_buffer_page(struct buffer_head *bh)
-{
- struct folio *folio = bh->b_folio;
- if (!folio->mapping && folio_trylock(folio)) {
- folio_get(folio);
- put_bh(bh);
- if (!folio->mapping)
- try_to_free_buffers(folio);
- folio_unlock(folio);
- folio_put(folio);
- } else {
- put_bh(bh);
- }
-}
-
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
-{
- if (buffer_journaled(bh)) {
- reiserfs_warning(NULL, "clm-2084",
- "pinned buffer %lu:%pg sent to disk",
- bh->b_blocknr, bh->b_bdev);
- }
- if (uptodate)
- set_buffer_uptodate(bh);
- else
- clear_buffer_uptodate(bh);
-
- unlock_buffer(bh);
- release_buffer_page(bh);
-}
-
-static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
-{
- if (uptodate)
- set_buffer_uptodate(bh);
- else
- clear_buffer_uptodate(bh);
- unlock_buffer(bh);
- put_bh(bh);
-}
-
-static void submit_logged_buffer(struct buffer_head *bh)
-{
- get_bh(bh);
- bh->b_end_io = reiserfs_end_buffer_io_sync;
- clear_buffer_journal_new(bh);
- clear_buffer_dirty(bh);
- if (!test_clear_buffer_journal_test(bh))
- BUG();
- if (!buffer_uptodate(bh))
- BUG();
- submit_bh(REQ_OP_WRITE, bh);
-}
-
-static void submit_ordered_buffer(struct buffer_head *bh)
-{
- get_bh(bh);
- bh->b_end_io = reiserfs_end_ordered_io;
- clear_buffer_dirty(bh);
- if (!buffer_uptodate(bh))
- BUG();
- submit_bh(REQ_OP_WRITE, bh);
-}
-
-#define CHUNK_SIZE 32
-struct buffer_chunk {
- struct buffer_head *bh[CHUNK_SIZE];
- int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk)
-{
- int i;
- for (i = 0; i < chunk->nr; i++) {
- submit_logged_buffer(chunk->bh[i]);
- }
- chunk->nr = 0;
-}
-
-static void write_ordered_chunk(struct buffer_chunk *chunk)
-{
- int i;
- for (i = 0; i < chunk->nr; i++) {
- submit_ordered_buffer(chunk->bh[i]);
- }
- chunk->nr = 0;
-}
-
-static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
- spinlock_t *lock, void (fn) (struct buffer_chunk *))
-{
- int ret = 0;
- BUG_ON(chunk->nr >= CHUNK_SIZE);
- chunk->bh[chunk->nr++] = bh;
- if (chunk->nr >= CHUNK_SIZE) {
- ret = 1;
- if (lock) {
- spin_unlock(lock);
- fn(chunk);
- spin_lock(lock);
- } else {
- fn(chunk);
- }
- }
- return ret;
-}
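-/*
- * Usage pattern (as seen later in this file): callers feed buffers in
- * with add_to_chunk(&chunk, bh, lock, write_chunk) and, when the loop
- * ends, flush whatever is left themselves:
- *
- *   if (chunk.nr)
- *           write_chunk(&chunk);
- */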
-
-static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
-static struct reiserfs_jh *alloc_jh(void)
-{
- struct reiserfs_jh *jh;
- while (1) {
- jh = kmalloc(sizeof(*jh), GFP_NOFS);
- if (jh) {
- atomic_inc(&nr_reiserfs_jh);
- return jh;
- }
- yield();
- }
-}
-
-/*
- * we want to free the jh when the buffer has been written
- * and waited on
- */
-void reiserfs_free_jh(struct buffer_head *bh)
-{
- struct reiserfs_jh *jh;
-
- jh = bh->b_private;
- if (jh) {
- bh->b_private = NULL;
- jh->bh = NULL;
- list_del_init(&jh->list);
- kfree(jh);
- if (atomic_read(&nr_reiserfs_jh) <= 0)
- BUG();
- atomic_dec(&nr_reiserfs_jh);
- put_bh(bh);
- }
-}
-
-static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
- int tail)
-{
- struct reiserfs_jh *jh;
-
- if (bh->b_private) {
- spin_lock(&j->j_dirty_buffers_lock);
- if (!bh->b_private) {
- spin_unlock(&j->j_dirty_buffers_lock);
- goto no_jh;
- }
- jh = bh->b_private;
- list_del_init(&jh->list);
- } else {
-no_jh:
- get_bh(bh);
- jh = alloc_jh();
- spin_lock(&j->j_dirty_buffers_lock);
- /*
- * buffer must be locked for __add_jh; we should never see
- * two adds for the same buffer at the same time
- */
- BUG_ON(bh->b_private);
- jh->bh = bh;
- bh->b_private = jh;
- }
- jh->jl = j->j_current_jl;
- if (tail)
- list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
- else {
- list_add_tail(&jh->list, &jh->jl->j_bh_list);
- }
- spin_unlock(&j->j_dirty_buffers_lock);
- return 0;
-}
-
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
-{
- return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
-}
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
-{
- return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
-}
-
-#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
-static int write_ordered_buffers(spinlock_t *lock,
- struct reiserfs_journal *j,
- struct reiserfs_journal_list *jl,
- struct list_head *list)
-{
- struct buffer_head *bh;
- struct reiserfs_jh *jh;
- int ret = j->j_errno;
- struct buffer_chunk chunk;
- struct list_head tmp;
- INIT_LIST_HEAD(&tmp);
-
- chunk.nr = 0;
- spin_lock(lock);
- while (!list_empty(list)) {
- jh = JH_ENTRY(list->next);
- bh = jh->bh;
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- if (!buffer_dirty(bh)) {
- list_move(&jh->list, &tmp);
- goto loop_next;
- }
- spin_unlock(lock);
- if (chunk.nr)
- write_ordered_chunk(&chunk);
- wait_on_buffer(bh);
- cond_resched();
- spin_lock(lock);
- goto loop_next;
- }
- /*
- * in theory, dirty non-uptodate buffers should never get here,
- * but the upper layer io error paths still have a few quirks.
- * Handle them here as gracefully as we can
- */
- if (!buffer_uptodate(bh) && buffer_dirty(bh)) {
- clear_buffer_dirty(bh);
- ret = -EIO;
- }
- if (buffer_dirty(bh)) {
- list_move(&jh->list, &tmp);
- add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
- } else {
- reiserfs_free_jh(bh);
- unlock_buffer(bh);
- }
-loop_next:
- put_bh(bh);
- cond_resched_lock(lock);
- }
- if (chunk.nr) {
- spin_unlock(lock);
- write_ordered_chunk(&chunk);
- spin_lock(lock);
- }
- while (!list_empty(&tmp)) {
- jh = JH_ENTRY(tmp.prev);
- bh = jh->bh;
- get_bh(bh);
- reiserfs_free_jh(bh);
-
- if (buffer_locked(bh)) {
- spin_unlock(lock);
- wait_on_buffer(bh);
- spin_lock(lock);
- }
- if (!buffer_uptodate(bh)) {
- ret = -EIO;
- }
- /*
- * ugly interaction with invalidate_folio here.
- * reiserfs_invalidate_folio will pin any buffer that has a
- * valid journal head from an older transaction. If someone
- * else sets our buffer dirty after we write it in the first
- * loop, and then someone truncates the page away, nobody
- * will ever write the buffer. We're safe if we write the
- * page one last time after freeing the journal header.
- */
- if (buffer_dirty(bh) && unlikely(bh->b_folio->mapping == NULL)) {
- spin_unlock(lock);
- write_dirty_buffer(bh, 0);
- spin_lock(lock);
- }
- put_bh(bh);
- cond_resched_lock(lock);
- }
- spin_unlock(lock);
- return ret;
-}
-
-static int flush_older_commits(struct super_block *s,
- struct reiserfs_journal_list *jl)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- struct reiserfs_journal_list *other_jl;
- struct reiserfs_journal_list *first_jl;
- struct list_head *entry;
- unsigned int trans_id = jl->j_trans_id;
- unsigned int other_trans_id;
-
-find_first:
- /*
- * first we walk backwards to find the oldest uncommitted transaction
- */
- first_jl = jl;
- entry = jl->j_list.prev;
- while (1) {
- other_jl = JOURNAL_LIST_ENTRY(entry);
- if (entry == &journal->j_journal_list ||
- atomic_read(&other_jl->j_older_commits_done))
- break;
-
- first_jl = other_jl;
- entry = other_jl->j_list.prev;
- }
-
- /* if we didn't find any older uncommitted transactions, return now */
- if (first_jl == jl) {
- return 0;
- }
-
- entry = &first_jl->j_list;
- while (1) {
- other_jl = JOURNAL_LIST_ENTRY(entry);
- other_trans_id = other_jl->j_trans_id;
-
- if (other_trans_id < trans_id) {
- if (atomic_read(&other_jl->j_commit_left) != 0) {
- flush_commit_list(s, other_jl, 0);
-
- /* list we were called with is gone, return */
- if (!journal_list_still_alive(s, trans_id))
- return 1;
-
- /*
- * the one we just flushed is gone, this means
- * all older lists are also gone, so first_jl
- * is no longer valid either. Go back to the
- * beginning.
- */
- if (!journal_list_still_alive
- (s, other_trans_id)) {
- goto find_first;
- }
- }
- entry = entry->next;
- if (entry == &journal->j_journal_list)
- return 0;
- } else {
- return 0;
- }
- }
- return 0;
-}
-
-static int reiserfs_async_progress_wait(struct super_block *s)
-{
- struct reiserfs_journal *j = SB_JOURNAL(s);
-
- if (atomic_read(&j->j_async_throttle)) {
- int depth;
-
- depth = reiserfs_write_unlock_nested(s);
- wait_var_event_timeout(&j->j_async_throttle,
- atomic_read(&j->j_async_throttle) == 0,
- HZ / 10);
- reiserfs_write_lock_nested(s, depth);
- }
-
- return 0;
-}
-
-/*
- * if this journal list still has commit blocks unflushed, send them to disk.
- *
- * log areas must be flushed in order (transaction 2 can't commit before
- * transaction 1). Before the commit block can be written, every other log
- * block must be safely on disk
- */
-static int flush_commit_list(struct super_block *s,
- struct reiserfs_journal_list *jl, int flushall)
-{
- int i;
- b_blocknr_t bn;
- struct buffer_head *tbh = NULL;
- unsigned int trans_id = jl->j_trans_id;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- int retval = 0;
- int write_len;
- int depth;
-
- reiserfs_check_lock_depth(s, "flush_commit_list");
-
- if (atomic_read(&jl->j_older_commits_done)) {
- return 0;
- }
-
- /*
- * before we can put our commit blocks on disk, we have to make
- * sure everyone older than us is on disk too
- */
- BUG_ON(jl->j_len <= 0);
- BUG_ON(trans_id == journal->j_trans_id);
-
- get_journal_list(jl);
- if (flushall) {
- if (flush_older_commits(s, jl) == 1) {
- /*
- * list disappeared during flush_older_commits.
- * return
- */
- goto put_jl;
- }
- }
-
- /* make sure nobody is trying to flush this one at the same time */
- reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
-
- if (!journal_list_still_alive(s, trans_id)) {
- mutex_unlock(&jl->j_commit_mutex);
- goto put_jl;
- }
- BUG_ON(jl->j_trans_id == 0);
-
- /* this commit is done, exit */
- if (atomic_read(&jl->j_commit_left) <= 0) {
- if (flushall) {
- atomic_set(&jl->j_older_commits_done, 1);
- }
- mutex_unlock(&jl->j_commit_mutex);
- goto put_jl;
- }
-
- if (!list_empty(&jl->j_bh_list)) {
- int ret;
-
- /*
- * We might sleep in numerous places inside
- * write_ordered_buffers. Relax the write lock.
- */
- depth = reiserfs_write_unlock_nested(s);
- ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
- journal, jl, &jl->j_bh_list);
- if (ret < 0 && retval == 0)
- retval = ret;
- reiserfs_write_lock_nested(s, depth);
- }
- BUG_ON(!list_empty(&jl->j_bh_list));
- /*
- * for the description block and all the log blocks, submit any buffers
- * that haven't already reached the disk. Try to write at least 256
- * log blocks. later on, we will only wait on blocks that correspond
- * to this transaction, but while we're unplugging we might as well
- * get a chunk of data on there.
- */
- atomic_inc(&journal->j_async_throttle);
- write_len = jl->j_len + 1;
- if (write_len < 256)
- write_len = 256;
- for (i = 0; i < write_len; i++) {
- bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
- SB_ONDISK_JOURNAL_SIZE(s);
- tbh = journal_find_get_block(s, bn);
- if (tbh) {
- if (buffer_dirty(tbh)) {
- depth = reiserfs_write_unlock_nested(s);
- write_dirty_buffer(tbh, 0);
- reiserfs_write_lock_nested(s, depth);
- }
- put_bh(tbh);
- }
- }
- if (atomic_dec_and_test(&journal->j_async_throttle))
- wake_up_var(&journal->j_async_throttle);
-
- for (i = 0; i < (jl->j_len + 1); i++) {
- bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
- (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
- tbh = journal_find_get_block(s, bn);
-
- depth = reiserfs_write_unlock_nested(s);
- __wait_on_buffer(tbh);
- reiserfs_write_lock_nested(s, depth);
- /*
- * since we used write_dirty_buffer() above, it might have skipped
- * over a locked buffer. Double check here
- */
- /* redundant, sync_dirty_buffer() checks */
- if (buffer_dirty(tbh)) {
- depth = reiserfs_write_unlock_nested(s);
- sync_dirty_buffer(tbh);
- reiserfs_write_lock_nested(s, depth);
- }
- if (unlikely(!buffer_uptodate(tbh))) {
-#ifdef CONFIG_REISERFS_CHECK
- reiserfs_warning(s, "journal-601",
- "buffer write failed");
-#endif
- retval = -EIO;
- }
- /* once for journal_find_get_block */
- put_bh(tbh);
- /* once due to original getblk in do_journal_end */
- put_bh(tbh);
- atomic_dec(&jl->j_commit_left);
- }
-
- BUG_ON(atomic_read(&jl->j_commit_left) != 1);
-
- /*
- * If there was a write error in the journal - we can't commit
- * this transaction - it will be invalid and, if successful,
- * will just end up propagating the write error out to
- * the file system.
- */
- if (likely(!retval && !reiserfs_is_journal_aborted(journal))) {
- if (buffer_dirty(jl->j_commit_bh))
- BUG();
- mark_buffer_dirty(jl->j_commit_bh);
- depth = reiserfs_write_unlock_nested(s);
- if (reiserfs_barrier_flush(s))
- __sync_dirty_buffer(jl->j_commit_bh,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
- else
- sync_dirty_buffer(jl->j_commit_bh);
- reiserfs_write_lock_nested(s, depth);
- }
-
- /*
- * The commit block did not make it to disk intact, so the
- * transaction cannot be trusted; propagate the error.
- */
- if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
- reiserfs_warning(s, "journal-615", "buffer write failed");
-#endif
- retval = -EIO;
- }
- bforget(jl->j_commit_bh);
- if (journal->j_last_commit_id != 0 &&
- (jl->j_trans_id - journal->j_last_commit_id) != 1) {
- reiserfs_warning(s, "clm-2200", "last commit %u, current %u",
- journal->j_last_commit_id, jl->j_trans_id);
- }
- journal->j_last_commit_id = jl->j_trans_id;
-
- /*
- * now, every commit block is on the disk. It is safe to allow
- * blocks freed during this transaction to be reallocated
- */
- cleanup_freed_for_journal_list(s, jl);
-
- retval = retval ? retval : journal->j_errno;
-
- /* mark the metadata dirty */
- if (!retval)
- dirty_one_transaction(s, jl);
- atomic_dec(&jl->j_commit_left);
-
- if (flushall) {
- atomic_set(&jl->j_older_commits_done, 1);
- }
- mutex_unlock(&jl->j_commit_mutex);
-put_jl:
- put_journal_list(s, jl);
-
- if (retval)
- reiserfs_abort(s, retval, "Journal write error in %s",
- __func__);
- return retval;
-}
-
-/*
- * flush_journal_list frequently needs to find a newer transaction for a
- * given block. This does that, or returns NULL if it can't find anything
- */
-static struct reiserfs_journal_list *
-find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn)
-{
- struct super_block *sb = cn->sb;
- b_blocknr_t blocknr = cn->blocknr;
-
- cn = cn->hprev;
- while (cn) {
- if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
- return cn->jlist;
- }
- cn = cn->hprev;
- }
- return NULL;
-}
-
-static void remove_journal_hash(struct super_block *,
- struct reiserfs_journal_cnode **,
- struct reiserfs_journal_list *, unsigned long,
- int);
-
-/*
- * once all the real blocks have been flushed, it is safe to remove them
- * from the journal list for this transaction. Aside from freeing the
- * cnode, this also allows the block to be reallocated for data blocks
- * if it had been deleted.
- */
-static void remove_all_from_journal_list(struct super_block *sb,
- struct reiserfs_journal_list *jl,
- int debug)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_cnode *cn, *last;
- cn = jl->j_realblock;
-
- /*
- * which is better, to lock once around the whole loop, or
- * to lock for each call to remove_journal_hash?
- */
- while (cn) {
- if (cn->blocknr != 0) {
- if (debug) {
- reiserfs_warning(sb, "reiserfs-2201",
- "block %u, bh is %d, state %ld",
- cn->blocknr, cn->bh ? 1 : 0,
- cn->state);
- }
- cn->state = 0;
- remove_journal_hash(sb, journal->j_list_hash_table,
- jl, cn->blocknr, 1);
- }
- last = cn;
- cn = cn->next;
- free_cnode(sb, last);
- }
- jl->j_realblock = NULL;
-}
-
-/*
- * if this timestamp is greater than the timestamp we wrote last to the
- * header block, write it to the header block. Once this is done, I can
- * safely say the log area for this transaction won't ever be replayed,
- * and I can start releasing blocks in this transaction for reuse as data
- * blocks. called by flush_journal_list, before it calls
- * remove_all_from_journal_list
- */
-static int _update_journal_header_block(struct super_block *sb,
- unsigned long offset,
- unsigned int trans_id)
-{
- struct reiserfs_journal_header *jh;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- int depth;
-
- if (reiserfs_is_journal_aborted(journal))
- return -EIO;
-
- if (trans_id >= journal->j_last_flush_trans_id) {
- if (buffer_locked((journal->j_header_bh))) {
- depth = reiserfs_write_unlock_nested(sb);
- __wait_on_buffer(journal->j_header_bh);
- reiserfs_write_lock_nested(sb, depth);
- if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
-#ifdef CONFIG_REISERFS_CHECK
- reiserfs_warning(sb, "journal-699",
- "buffer write failed");
-#endif
- return -EIO;
- }
- }
- journal->j_last_flush_trans_id = trans_id;
- journal->j_first_unflushed_offset = offset;
- jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
- b_data);
- jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
- jh->j_first_unflushed_offset = cpu_to_le32(offset);
- jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
-
- set_buffer_dirty(journal->j_header_bh);
- depth = reiserfs_write_unlock_nested(sb);
-
- if (reiserfs_barrier_flush(sb))
- __sync_dirty_buffer(journal->j_header_bh,
- REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
- else
- sync_dirty_buffer(journal->j_header_bh);
-
- reiserfs_write_lock_nested(sb, depth);
- if (!buffer_uptodate(journal->j_header_bh)) {
- reiserfs_warning(sb, "journal-837",
- "IO error during journal replay");
- return -EIO;
- }
- }
- return 0;
-}
-
-static int update_journal_header_block(struct super_block *sb,
- unsigned long offset,
- unsigned int trans_id)
-{
- return _update_journal_header_block(sb, offset, trans_id);
-}
-
-/*
- * flush any and all journal lists older than the given one;
- * can only be called from flush_journal_list
- */
-static int flush_older_journal_lists(struct super_block *sb,
- struct reiserfs_journal_list *jl)
-{
- struct list_head *entry;
- struct reiserfs_journal_list *other_jl;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- unsigned int trans_id = jl->j_trans_id;
-
- /*
- * we know we are the only ones flushing things, no extra race
- * protection is required.
- */
-restart:
- entry = journal->j_journal_list.next;
- /* Did we wrap? */
- if (entry == &journal->j_journal_list)
- return 0;
- other_jl = JOURNAL_LIST_ENTRY(entry);
- if (other_jl->j_trans_id < trans_id) {
- BUG_ON(other_jl->j_refcount <= 0);
- /* do not flush all */
- flush_journal_list(sb, other_jl, 0);
-
- /* other_jl is now deleted from the list */
- goto restart;
- }
- return 0;
-}
-
-static void del_from_work_list(struct super_block *s,
- struct reiserfs_journal_list *jl)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- if (!list_empty(&jl->j_working_list)) {
- list_del_init(&jl->j_working_list);
- journal->j_num_work_lists--;
- }
-}
-
-/*
- * flush a journal list, both commit and real blocks
- *
- * always set flushall to 1, unless you are calling from inside
- * flush_journal_list
- *
- * IMPORTANT. This can only be called while there are no journal writers,
- * and the journal is locked. That means it can only be called from
- * do_journal_end, or by journal_release
- */
-static int flush_journal_list(struct super_block *s,
- struct reiserfs_journal_list *jl, int flushall)
-{
- struct reiserfs_journal_list *pjl;
- struct reiserfs_journal_cnode *cn;
- int count;
- int was_jwait = 0;
- int was_dirty = 0;
- struct buffer_head *saved_bh;
- unsigned long j_len_saved = jl->j_len;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- int err = 0;
- int depth;
-
- BUG_ON(j_len_saved <= 0);
-
- if (atomic_read(&journal->j_wcount) != 0) {
- reiserfs_warning(s, "clm-2048", "called with wcount %d",
- atomic_read(&journal->j_wcount));
- }
-
- /* if flushall == 0, the lock is already held */
- if (flushall) {
- reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
- } else if (mutex_trylock(&journal->j_flush_mutex)) {
- BUG();
- }
-
- count = 0;
- if (j_len_saved > journal->j_trans_max) {
- reiserfs_panic(s, "journal-715", "length is %lu, trans id %u",
- j_len_saved, jl->j_trans_id);
- return 0;
- }
-
- /* if all the work is already done, get out of here */
- if (atomic_read(&jl->j_nonzerolen) <= 0 &&
- atomic_read(&jl->j_commit_left) <= 0) {
- goto flush_older_and_return;
- }
-
- /*
- * start by putting the commit list on disk. This will also flush
- * the commit lists of any older transactions
- */
- flush_commit_list(s, jl, 1);
-
- if (!(jl->j_state & LIST_DIRTY)
- && !reiserfs_is_journal_aborted(journal))
- BUG();
-
- /* are we done now? */
- if (atomic_read(&jl->j_nonzerolen) <= 0 &&
- atomic_read(&jl->j_commit_left) <= 0) {
- goto flush_older_and_return;
- }
-
- /*
- * loop through each cnode, see if we need to write it,
- * or wait on a more recent transaction, or just ignore it
- */
- if (atomic_read(&journal->j_wcount) != 0) {
- reiserfs_panic(s, "journal-844", "journal list is flushing, "
- "wcount is not 0");
- }
- cn = jl->j_realblock;
- while (cn) {
- was_jwait = 0;
- was_dirty = 0;
- saved_bh = NULL;
- /* blocknr of 0 is no longer in the hash, ignore it */
- if (cn->blocknr == 0) {
- goto free_cnode;
- }
-
- /*
- * This transaction failed commit.
- * Don't write out to the disk
- */
- if (!(jl->j_state & LIST_DIRTY))
- goto free_cnode;
-
- pjl = find_newer_jl_for_cn(cn);
- /*
- * the order is important here. We check pjl to make sure we
- * don't clear BH_JDirty_wait if we aren't the one writing this
- * block to disk
- */
- if (!pjl && cn->bh) {
- saved_bh = cn->bh;
-
- /*
- * we do this to make sure nobody releases the
- * buffer while we are working with it
- */
- get_bh(saved_bh);
-
- if (buffer_journal_dirty(saved_bh)) {
- BUG_ON(!can_dirty(cn));
- was_jwait = 1;
- was_dirty = 1;
- } else if (can_dirty(cn)) {
- /*
- * everything with !pjl && jwait
- * should be writable
- */
- BUG();
- }
- }
-
- /*
- * if someone has this block in a newer transaction, just make
- * sure they are committed, and don't try writing it to disk
- */
- if (pjl) {
- if (atomic_read(&pjl->j_commit_left))
- flush_commit_list(s, pjl, 1);
- goto free_cnode;
- }
-
- /*
- * bh == NULL when the block got to disk on its own, OR,
- * the block got freed in a future transaction
- */
- if (saved_bh == NULL) {
- goto free_cnode;
- }
-
- /*
- * this should never happen. kupdate_one_transaction has
- * this list locked while it works, so we should never see a
- * buffer here that is not marked JDirty_wait
- */
- if ((!was_jwait) && !buffer_locked(saved_bh)) {
- reiserfs_warning(s, "journal-813",
- "BAD! buffer %llu %cdirty %cjwait, "
- "not in a newer transaction",
- (unsigned long long)saved_bh->
- b_blocknr, was_dirty ? ' ' : '!',
- was_jwait ? ' ' : '!');
- }
- if (was_dirty) {
- /*
- * we inc again because saved_bh gets decremented
- * at free_cnode
- */
- get_bh(saved_bh);
- set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
- lock_buffer(saved_bh);
- BUG_ON(cn->blocknr != saved_bh->b_blocknr);
- if (buffer_dirty(saved_bh))
- submit_logged_buffer(saved_bh);
- else
- unlock_buffer(saved_bh);
- count++;
- } else {
- reiserfs_warning(s, "clm-2082",
- "Unable to flush buffer %llu in %s",
- (unsigned long long)saved_bh->
- b_blocknr, __func__);
- }
-free_cnode:
- cn = cn->next;
- if (saved_bh) {
- /*
- * we incremented this to keep others from
- * taking the buffer head away
- */
- put_bh(saved_bh);
- if (atomic_read(&saved_bh->b_count) < 0) {
- reiserfs_warning(s, "journal-945",
- "saved_bh->b_count < 0");
- }
- }
- }
- if (count > 0) {
- cn = jl->j_realblock;
- while (cn) {
- if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
- if (!cn->bh) {
- reiserfs_panic(s, "journal-1011",
- "cn->bh is NULL");
- }
-
- depth = reiserfs_write_unlock_nested(s);
- __wait_on_buffer(cn->bh);
- reiserfs_write_lock_nested(s, depth);
-
- if (!cn->bh) {
- reiserfs_panic(s, "journal-1012",
- "cn->bh is NULL");
- }
- if (unlikely(!buffer_uptodate(cn->bh))) {
-#ifdef CONFIG_REISERFS_CHECK
- reiserfs_warning(s, "journal-949",
- "buffer write failed");
-#endif
- err = -EIO;
- }
- /*
- * note, we must clear the JDirty_wait bit
- * after the up to date check, otherwise we
- * race against our invalidate_folio routine
- */
- BUG_ON(!test_clear_buffer_journal_dirty
- (cn->bh));
-
- /* drop one ref for us */
- put_bh(cn->bh);
- /* drop one ref for journal_mark_dirty */
- release_buffer_page(cn->bh);
- }
- cn = cn->next;
- }
- }
-
- if (err)
- reiserfs_abort(s, -EIO,
- "Write error while pushing transaction to disk in %s",
- __func__);
-flush_older_and_return:
-
- /*
- * before we can update the journal header block, we _must_ flush all
- * real blocks from all older transactions to disk. This is because
- * once the header block is updated, this transaction will not be
- * replayed after a crash
- */
- if (flushall) {
- flush_older_journal_lists(s, jl);
- }
-
- err = journal->j_errno;
- /*
- * before we can remove everything from the hash tables for this
- * transaction, we must make sure it can never be replayed
- *
- * since we are only called from do_journal_end, we know for sure there
- * are no allocations going on while we are flushing journal lists. So,
- * we only need to update the journal header block for the last list
- * being flushed
- */
- if (!err && flushall) {
- err =
- update_journal_header_block(s,
- (jl->j_start + jl->j_len +
- 2) % SB_ONDISK_JOURNAL_SIZE(s),
- jl->j_trans_id);
- if (err)
- reiserfs_abort(s, -EIO,
- "Write error while updating journal header in %s",
- __func__);
- }
- remove_all_from_journal_list(s, jl, 0);
- list_del_init(&jl->j_list);
- journal->j_num_lists--;
- del_from_work_list(s, jl);
-
- if (journal->j_last_flush_id != 0 &&
- (jl->j_trans_id - journal->j_last_flush_id) != 1) {
- reiserfs_warning(s, "clm-2201", "last flush %u, current %u",
- journal->j_last_flush_id, jl->j_trans_id);
- }
- journal->j_last_flush_id = jl->j_trans_id;
-
- /*
- * not strictly required since we are freeing the list, but it should
- * help find code using dead lists later on
- */
- jl->j_len = 0;
- atomic_set(&jl->j_nonzerolen, 0);
- jl->j_start = 0;
- jl->j_realblock = NULL;
- jl->j_commit_bh = NULL;
- jl->j_trans_id = 0;
- jl->j_state = 0;
- put_journal_list(s, jl);
- if (flushall)
- mutex_unlock(&journal->j_flush_mutex);
- return err;
-}
-
-static int write_one_transaction(struct super_block *s,
- struct reiserfs_journal_list *jl,
- struct buffer_chunk *chunk)
-{
- struct reiserfs_journal_cnode *cn;
- int ret = 0;
-
- jl->j_state |= LIST_TOUCHED;
- del_from_work_list(s, jl);
- if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
- return 0;
- }
-
- cn = jl->j_realblock;
- while (cn) {
- /*
- * if the blocknr == 0, this has been cleared from the hash,
- * skip it
- */
- if (cn->blocknr == 0) {
- goto next;
- }
- if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
- struct buffer_head *tmp_bh;
- /*
- * we can race against journal_mark_freed when we try
- * to lock_buffer(cn->bh), so we have to inc the buffer
- * count, and recheck things after locking
- */
- tmp_bh = cn->bh;
- get_bh(tmp_bh);
- lock_buffer(tmp_bh);
- if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
- if (!buffer_journal_dirty(tmp_bh) ||
- buffer_journal_prepared(tmp_bh))
- BUG();
- add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
- ret++;
- } else {
- /* note, cn->bh might be null now */
- unlock_buffer(tmp_bh);
- }
- put_bh(tmp_bh);
- }
-next:
- cn = cn->next;
- cond_resched();
- }
- return ret;
-}
-
-/* used by flush_commit_list */
-static void dirty_one_transaction(struct super_block *s,
- struct reiserfs_journal_list *jl)
-{
- struct reiserfs_journal_cnode *cn;
- struct reiserfs_journal_list *pjl;
-
- jl->j_state |= LIST_DIRTY;
- cn = jl->j_realblock;
- while (cn) {
- /*
- * look for a more recent transaction that logged this
- * buffer. Only the most recent transaction with a buffer in
- * it is allowed to send that buffer to disk
- */
- pjl = find_newer_jl_for_cn(cn);
- if (!pjl && cn->blocknr && cn->bh
- && buffer_journal_dirty(cn->bh)) {
- BUG_ON(!can_dirty(cn));
- /*
- * if the buffer is prepared, it will either be logged
- * or restored. If restored, we need to make sure
- * it actually gets marked dirty
- */
- clear_buffer_journal_new(cn->bh);
- if (buffer_journal_prepared(cn->bh)) {
- set_buffer_journal_restore_dirty(cn->bh);
- } else {
- set_buffer_journal_test(cn->bh);
- mark_buffer_dirty(cn->bh);
- }
- }
- cn = cn->next;
- }
-}
-
-static int kupdate_transactions(struct super_block *s,
- struct reiserfs_journal_list *jl,
- struct reiserfs_journal_list **next_jl,
- unsigned int *next_trans_id,
- int num_blocks, int num_trans)
-{
- int ret = 0;
- int written = 0;
- int transactions_flushed = 0;
- unsigned int orig_trans_id = jl->j_trans_id;
- struct buffer_chunk chunk;
- struct list_head *entry;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- chunk.nr = 0;
-
- reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
- if (!journal_list_still_alive(s, orig_trans_id)) {
- goto done;
- }
-
- /*
- * we've got j_flush_mutex held, nobody is going to delete any
- * of these lists out from underneath us
- */
- while ((num_trans && transactions_flushed < num_trans) ||
- (!num_trans && written < num_blocks)) {
-
- if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
- atomic_read(&jl->j_commit_left)
- || !(jl->j_state & LIST_DIRTY)) {
- del_from_work_list(s, jl);
- break;
- }
- ret = write_one_transaction(s, jl, &chunk);
-
- if (ret < 0)
- goto done;
- transactions_flushed++;
- written += ret;
- entry = jl->j_list.next;
-
- /* did we wrap? */
- if (entry == &journal->j_journal_list) {
- break;
- }
- jl = JOURNAL_LIST_ENTRY(entry);
-
- /* don't bother with older transactions */
- if (jl->j_trans_id <= orig_trans_id)
- break;
- }
- if (chunk.nr) {
- write_chunk(&chunk);
- }
-
-done:
- mutex_unlock(&journal->j_flush_mutex);
- return ret;
-}
-
-/*
- * O_SYNC and fsync heavy applications tend to use up
- * all the journal list slots with tiny transactions. These
- * trigger lots and lots of calls to update the header block, which
- * adds seeks and slows things down.
- *
- * This function tries to clear out a large chunk of the journal lists
- * at once, which makes everything faster since only the newest journal
- * list updates the header block
- */
-static int flush_used_journal_lists(struct super_block *s,
- struct reiserfs_journal_list *jl)
-{
- unsigned long len = 0;
- unsigned long cur_len;
- int i;
- int limit = 256;
- struct reiserfs_journal_list *tjl;
- struct reiserfs_journal_list *flush_jl;
- unsigned int trans_id;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
-
- flush_jl = tjl = jl;
-
- /* in data logging mode, try harder to flush a lot of blocks */
- if (reiserfs_data_log(s))
- limit = 1024;
- /* flush for 256 transactions or limit blocks, whichever comes first */
- for (i = 0; i < 256 && len < limit; i++) {
- if (atomic_read(&tjl->j_commit_left) ||
- tjl->j_trans_id < jl->j_trans_id) {
- break;
- }
- cur_len = atomic_read(&tjl->j_nonzerolen);
- if (cur_len > 0) {
- tjl->j_state &= ~LIST_TOUCHED;
- }
- len += cur_len;
- flush_jl = tjl;
- if (tjl->j_list.next == &journal->j_journal_list)
- break;
- tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
- }
- get_journal_list(jl);
- get_journal_list(flush_jl);
-
- /*
- * try to find a group of blocks we can flush across all the
- * transactions, but only bother if we've actually spanned
- * across multiple lists
- */
- if (flush_jl != jl)
- kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-
- flush_journal_list(s, flush_jl, 1);
- put_journal_list(s, flush_jl);
- put_journal_list(s, jl);
- return 0;
-}
-
-/*
- * removes any nodes in table that match the given block and super block.
- * only touches the hnext and hprev pointers.
- */
-static void remove_journal_hash(struct super_block *sb,
- struct reiserfs_journal_cnode **table,
- struct reiserfs_journal_list *jl,
- unsigned long block, int remove_freed)
-{
- struct reiserfs_journal_cnode *cur;
- struct reiserfs_journal_cnode **head;
-
- head = &(journal_hash(table, sb, block));
- if (!head) {
- return;
- }
- cur = *head;
- while (cur) {
- if (cur->blocknr == block && cur->sb == sb
- && (jl == NULL || jl == cur->jlist)
- && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
- if (cur->hnext) {
- cur->hnext->hprev = cur->hprev;
- }
- if (cur->hprev) {
- cur->hprev->hnext = cur->hnext;
- } else {
- *head = cur->hnext;
- }
- cur->blocknr = 0;
- cur->sb = NULL;
- cur->state = 0;
- /*
- * anybody who clears the cur->bh will also
- * dec the nonzerolen
- */
- if (cur->bh && cur->jlist)
- atomic_dec(&cur->jlist->j_nonzerolen);
- cur->bh = NULL;
- cur->jlist = NULL;
- }
- cur = cur->hnext;
- }
-}
-
-static void free_journal_ram(struct super_block *sb)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- kfree(journal->j_current_jl);
- journal->j_num_lists--;
-
- vfree(journal->j_cnode_free_orig);
- free_list_bitmaps(sb, journal->j_list_bitmap);
- free_bitmap_nodes(sb); /* must be after free_list_bitmaps */
- if (journal->j_header_bh) {
- brelse(journal->j_header_bh);
- }
- /*
- * j_header_bh is on the journal dev, make sure
- * not to release the journal dev until we brelse j_header_bh
- */
- release_journal_dev(journal);
- vfree(journal);
-}
-
-/*
- * call on unmount. Only set error to 1 if you haven't made your way out
- * of read_super() yet. Any other caller must keep error at 0.
- */
-static int do_journal_release(struct reiserfs_transaction_handle *th,
- struct super_block *sb, int error)
-{
- struct reiserfs_transaction_handle myth;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- /*
- * we only want to flush out transactions if we were
- * called with error == 0
- */
- if (!error && !sb_rdonly(sb)) {
- /* end the current trans */
- BUG_ON(!th->t_trans_id);
- do_journal_end(th, FLUSH_ALL);
-
- /*
- * make sure something gets logged to force
- * our way into the flush code
- */
- if (!journal_join(&myth, sb)) {
- reiserfs_prepare_for_journal(sb,
- SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, FLUSH_ALL);
- }
- }
-
- /* this also catches errors during the do_journal_end above */
- if (!error && reiserfs_is_journal_aborted(journal)) {
- memset(&myth, 0, sizeof(myth));
- if (!journal_join_abort(&myth, sb)) {
- reiserfs_prepare_for_journal(sb,
- SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, FLUSH_ALL);
- }
- }
-
-
- /*
- * We must release the write lock here because
- * the workqueue job (flush_async_commit) needs this lock
- */
- reiserfs_write_unlock(sb);
-
- /*
- * Cancel flushing of old commits. Note that neither of these works
- * will be requeued because superblock is being shutdown and doesn't
- * have SB_ACTIVE set.
- */
- reiserfs_cancel_old_flush(sb);
- /* wait for all commits to finish */
- cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
-
- free_journal_ram(sb);
-
- reiserfs_write_lock(sb);
-
- return 0;
-}
-
-/* call on unmount. flush all journal trans, release all alloc'd ram */
-int journal_release(struct reiserfs_transaction_handle *th,
- struct super_block *sb)
-{
- return do_journal_release(th, sb, 0);
-}
-
-/* only call from an error condition inside reiserfs_read_super! */
-int journal_release_error(struct reiserfs_transaction_handle *th,
- struct super_block *sb)
-{
- return do_journal_release(th, sb, 1);
-}
-
-/*
- * compares description block with commit block.
- * returns 1 if they differ, 0 if they are the same
- */
-static int journal_compare_desc_commit(struct super_block *sb,
- struct reiserfs_journal_desc *desc,
- struct reiserfs_journal_commit *commit)
-{
- if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
- get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
- get_commit_trans_len(commit) > SB_JOURNAL(sb)->j_trans_max ||
- get_commit_trans_len(commit) <= 0) {
- return 1;
- }
- return 0;
-}
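-
-/*
- * An illustrative layout sketch, inferred from the offset math used by
- * the journal_bread() callers below: a transaction occupies
- * trans_len + 2 journal blocks,
- *
- *	desc block at    offset
- *	data blocks at   offset + 1 .. offset + trans_len
- *	commit block at  (offset + trans_len + 1) % SB_ONDISK_JOURNAL_SIZE(sb)
- *
- * and the next transaction starts at offset + trans_len + 2 (mod the
- * on-disk journal size), which is exactly what the replay code below
- * advances by.
- */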
-
-/*
- * returns 0 if it did not find a description block
- * returns -1 if it found a corrupt commit block
- * returns 1 if both desc and commit were valid
- * NOTE: only called during fs mount
- */
-static int journal_transaction_is_valid(struct super_block *sb,
- struct buffer_head *d_bh,
- unsigned int *oldest_invalid_trans_id,
- unsigned long *newest_mount_id)
-{
- struct reiserfs_journal_desc *desc;
- struct reiserfs_journal_commit *commit;
- struct buffer_head *c_bh;
- unsigned long offset;
-
- if (!d_bh)
- return 0;
-
- desc = (struct reiserfs_journal_desc *)d_bh->b_data;
- if (get_desc_trans_len(desc) > 0
- && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
- if (oldest_invalid_trans_id && *oldest_invalid_trans_id
- && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-986: transaction "
- "is valid returning because trans_id %d is greater than "
- "oldest_invalid %lu",
- get_desc_trans_id(desc),
- *oldest_invalid_trans_id);
- return 0;
- }
- if (newest_mount_id
- && *newest_mount_id > get_desc_mount_id(desc)) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1087: transaction "
- "is valid returning because mount_id %d is less than "
- "newest_mount_id %lu",
- get_desc_mount_id(desc),
- *newest_mount_id);
- return -1;
- }
- if (get_desc_trans_len(desc) > SB_JOURNAL(sb)->j_trans_max) {
- reiserfs_warning(sb, "journal-2018",
- "Bad transaction length %d "
- "encountered, ignoring transaction",
- get_desc_trans_len(desc));
- return -1;
- }
- offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-
- /*
- * ok, we have a journal description block,
- * let's see if the transaction was valid
- */
- c_bh =
- journal_bread(sb,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- ((offset + get_desc_trans_len(desc) +
- 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
- if (!c_bh)
- return 0;
- commit = (struct reiserfs_journal_commit *)c_bh->b_data;
- if (journal_compare_desc_commit(sb, desc, commit)) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal_transaction_is_valid, commit offset %ld had bad "
- "time %d or length %d",
- c_bh->b_blocknr -
- SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- get_commit_trans_id(commit),
- get_commit_trans_len(commit));
- brelse(c_bh);
- if (oldest_invalid_trans_id) {
- *oldest_invalid_trans_id =
- get_desc_trans_id(desc);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1004: "
- "transaction_is_valid setting oldest invalid trans_id "
- "to %d",
- get_desc_trans_id(desc));
- }
- return -1;
- }
- brelse(c_bh);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1006: found valid "
- "transaction start offset %llu, len %d id %d",
- d_bh->b_blocknr -
- SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- get_desc_trans_len(desc),
- get_desc_trans_id(desc));
- return 1;
- } else {
- return 0;
- }
-}
-
-static void brelse_array(struct buffer_head **heads, int num)
-{
- int i;
- for (i = 0; i < num; i++) {
- brelse(heads[i]);
- }
-}
-
-/*
- * given the start, and values for the oldest acceptable transactions,
- * this either reads in and replays a transaction, or returns because the
- * transaction is invalid, or too old.
- * NOTE: only called during fs mount
- */
-static int journal_read_transaction(struct super_block *sb,
- unsigned long cur_dblock,
- unsigned long oldest_start,
- unsigned int oldest_trans_id,
- unsigned long newest_mount_id)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_desc *desc;
- struct reiserfs_journal_commit *commit;
- unsigned int trans_id = 0;
- struct buffer_head *c_bh;
- struct buffer_head *d_bh;
- struct buffer_head **log_blocks = NULL;
- struct buffer_head **real_blocks = NULL;
- unsigned int trans_offset;
- int i;
- int trans_half;
-
- d_bh = journal_bread(sb, cur_dblock);
- if (!d_bh)
- return 1;
- desc = (struct reiserfs_journal_desc *)d_bh->b_data;
- trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1037: "
- "journal_read_transaction, offset %llu, len %d mount_id %d",
- d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- get_desc_trans_len(desc), get_desc_mount_id(desc));
- if (get_desc_trans_id(desc) < oldest_trans_id) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1039: "
- "journal_read_trans skipping because %lu is too old",
- cur_dblock -
- SB_ONDISK_JOURNAL_1st_BLOCK(sb));
- brelse(d_bh);
- return 1;
- }
- if (get_desc_mount_id(desc) != newest_mount_id) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1146: "
- "journal_read_trans skipping because %d is != "
- "newest_mount_id %lu", get_desc_mount_id(desc),
- newest_mount_id);
- brelse(d_bh);
- return 1;
- }
- c_bh = journal_bread(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- ((trans_offset + get_desc_trans_len(desc) + 1) %
- SB_ONDISK_JOURNAL_SIZE(sb)));
- if (!c_bh) {
- brelse(d_bh);
- return 1;
- }
- commit = (struct reiserfs_journal_commit *)c_bh->b_data;
- if (journal_compare_desc_commit(sb, desc, commit)) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal_read_transaction, "
- "commit offset %llu had bad time %d or length %d",
- c_bh->b_blocknr -
- SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- get_commit_trans_id(commit),
- get_commit_trans_len(commit));
- brelse(c_bh);
- brelse(d_bh);
- return 1;
- }
-
- if (bdev_read_only(sb->s_bdev)) {
- reiserfs_warning(sb, "clm-2076",
- "device is readonly, unable to replay log");
- brelse(c_bh);
- brelse(d_bh);
- return -EROFS;
- }
-
- trans_id = get_desc_trans_id(desc);
- /*
- * now we know we've got a good transaction, and it was
- * inside the valid time ranges
- */
- log_blocks = kmalloc_array(get_desc_trans_len(desc),
- sizeof(struct buffer_head *),
- GFP_NOFS);
- real_blocks = kmalloc_array(get_desc_trans_len(desc),
- sizeof(struct buffer_head *),
- GFP_NOFS);
- if (!log_blocks || !real_blocks) {
- brelse(c_bh);
- brelse(d_bh);
- kfree(log_blocks);
- kfree(real_blocks);
- reiserfs_warning(sb, "journal-1169",
- "kmalloc failed, unable to mount FS");
- return -1;
- }
- /* get all the buffer heads */
- trans_half = journal_trans_half(sb->s_blocksize);
- for (i = 0; i < get_desc_trans_len(desc); i++) {
- log_blocks[i] =
- journal_getblk(sb,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- (trans_offset + 1 +
- i) % SB_ONDISK_JOURNAL_SIZE(sb));
- if (i < trans_half) {
- real_blocks[i] =
- sb_getblk(sb,
- le32_to_cpu(desc->j_realblock[i]));
- } else {
- real_blocks[i] =
- sb_getblk(sb,
- le32_to_cpu(commit->
- j_realblock[i - trans_half]));
- }
- if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(sb)) {
- reiserfs_warning(sb, "journal-1207",
- "REPLAY FAILURE fsck required! "
- "Block to replay is outside of "
- "filesystem");
- goto abort_replay;
- }
- /* make sure we don't try to replay onto log or reserved area */
- if (is_block_in_log_or_reserved_area
- (sb, real_blocks[i]->b_blocknr)) {
- reiserfs_warning(sb, "journal-1204",
- "REPLAY FAILURE fsck required! "
- "Trying to replay onto a log block");
-abort_replay:
- brelse_array(log_blocks, i);
- brelse_array(real_blocks, i);
- brelse(c_bh);
- brelse(d_bh);
- kfree(log_blocks);
- kfree(real_blocks);
- return -1;
- }
- }
- /* read in the log blocks, memcpy to the corresponding real block */
- bh_read_batch(get_desc_trans_len(desc), log_blocks);
- for (i = 0; i < get_desc_trans_len(desc); i++) {
-
- wait_on_buffer(log_blocks[i]);
- if (!buffer_uptodate(log_blocks[i])) {
- reiserfs_warning(sb, "journal-1212",
- "REPLAY FAILURE fsck required! "
- "buffer write failed");
- brelse_array(log_blocks + i,
- get_desc_trans_len(desc) - i);
- brelse_array(real_blocks, get_desc_trans_len(desc));
- brelse(c_bh);
- brelse(d_bh);
- kfree(log_blocks);
- kfree(real_blocks);
- return -1;
- }
- memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
- real_blocks[i]->b_size);
- set_buffer_uptodate(real_blocks[i]);
- brelse(log_blocks[i]);
- }
- /* flush out the real blocks */
- for (i = 0; i < get_desc_trans_len(desc); i++) {
- set_buffer_dirty(real_blocks[i]);
- write_dirty_buffer(real_blocks[i], 0);
- }
- for (i = 0; i < get_desc_trans_len(desc); i++) {
- wait_on_buffer(real_blocks[i]);
- if (!buffer_uptodate(real_blocks[i])) {
- reiserfs_warning(sb, "journal-1226",
- "REPLAY FAILURE, fsck required! "
- "buffer write failed");
- brelse_array(real_blocks + i,
- get_desc_trans_len(desc) - i);
- brelse(c_bh);
- brelse(d_bh);
- kfree(log_blocks);
- kfree(real_blocks);
- return -1;
- }
- brelse(real_blocks[i]);
- }
- cur_dblock =
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- ((trans_offset + get_desc_trans_len(desc) +
- 2) % SB_ONDISK_JOURNAL_SIZE(sb));
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1095: setting journal " "start to offset %ld",
- cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
-
- /*
- * init starting values for the first transaction, in case
- * this is the last transaction to be replayed.
- */
- journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- journal->j_last_flush_trans_id = trans_id;
- journal->j_trans_id = trans_id + 1;
- /* check for trans_id overflow */
- if (journal->j_trans_id == 0)
- journal->j_trans_id = 10;
- brelse(c_bh);
- brelse(d_bh);
- kfree(log_blocks);
- kfree(real_blocks);
- return 0;
-}
-
-/*
- * This function reads blocks of bufsize size, starting at block and going
- * up to max_block (but no more than BUFNR blocks at a time). This proved
- * to improve mounting speed on self-rebuilding raid5 arrays at least.
- * Right now it is only used from journal code. But later we might use it
- * from other places.
- * Note: Do not use journal_getblk/sb_getblk functions here!
- */
-static struct buffer_head *reiserfs_breada(struct block_device *dev,
- b_blocknr_t block, int bufsize,
- b_blocknr_t max_block)
-{
- struct buffer_head *bhlist[BUFNR];
- unsigned int blocks = BUFNR;
- struct buffer_head *bh;
- int i, j;
-
- bh = __getblk(dev, block, bufsize);
- if (!bh || buffer_uptodate(bh))
- return (bh);
-
- if (block + BUFNR > max_block) {
- blocks = max_block - block;
- }
- bhlist[0] = bh;
- j = 1;
- for (i = 1; i < blocks; i++) {
- bh = __getblk(dev, block + i, bufsize);
- if (!bh)
- break;
- if (buffer_uptodate(bh)) {
- brelse(bh);
- break;
- } else
- bhlist[j++] = bh;
- }
- bh = bhlist[0];
- bh_read_nowait(bh, 0);
- bh_readahead_batch(j - 1, &bhlist[1], 0);
- for (i = 1; i < j; i++)
- brelse(bhlist[i]);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- brelse(bh);
- return NULL;
-}
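-
-/*
- * Illustrative use, mirroring the journal_read() call site below: read one
- * descriptor block with readahead clamped to the end of the on-disk log.
- *
- *	d_bh = reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
- *			       sb->s_blocksize,
- *			       SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- *			       SB_ONDISK_JOURNAL_SIZE(sb));
- */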
-
-/*
- * read and replay the log
- * on a clean unmount, the journal header's next unflushed pointer will be
- * to an invalid transaction. This tests that before finding all the
- * transactions in the log, which makes normal mount times fast.
- *
- * After a crash, this starts with the next unflushed transaction, and
- * replays until it finds one too old, or invalid.
- *
- * On exit, it sets things up so the first transaction will work correctly.
- * NOTE: only called during fs mount
- */
-static int journal_read(struct super_block *sb)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_desc *desc;
- unsigned int oldest_trans_id = 0;
- unsigned int oldest_invalid_trans_id = 0;
- time64_t start;
- unsigned long oldest_start = 0;
- unsigned long cur_dblock = 0;
- unsigned long newest_mount_id = 9;
- struct buffer_head *d_bh;
- struct reiserfs_journal_header *jh;
- int valid_journal_header = 0;
- int replay_count = 0;
- int continue_replay = 1;
- int ret;
-
- cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- reiserfs_info(sb, "checking transaction log (%pg)\n",
- file_bdev(journal->j_bdev_file));
- start = ktime_get_seconds();
-
- /*
- * step 1, read in the journal header block. Check the transaction
- * it says is the first unflushed, and if that transaction is not
- * valid, replay is done
- */
- journal->j_header_bh = journal_bread(sb,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb)
- + SB_ONDISK_JOURNAL_SIZE(sb));
- if (!journal->j_header_bh) {
- return 1;
- }
- jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
- if (le32_to_cpu(jh->j_first_unflushed_offset) <
- SB_ONDISK_JOURNAL_SIZE(sb)
- && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
- oldest_start =
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- le32_to_cpu(jh->j_first_unflushed_offset);
- oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
- newest_mount_id = le32_to_cpu(jh->j_mount_id);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1153: found in "
- "header: first_unflushed_offset %d, last_flushed_trans_id "
- "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
- le32_to_cpu(jh->j_last_flush_trans_id));
- valid_journal_header = 1;
-
- /*
- * now, we try to read the first unflushed offset. If it
- * is not valid, there is nothing more we can do, and it
- * makes no sense to read through the whole log.
- */
- d_bh =
- journal_bread(sb,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- le32_to_cpu(jh->j_first_unflushed_offset));
- ret = journal_transaction_is_valid(sb, d_bh, NULL, NULL);
- if (!ret) {
- continue_replay = 0;
- }
- brelse(d_bh);
- goto start_log_replay;
- }
-
- /*
- * ok, there are transactions that need to be replayed. start
- * with the first log block, find all the valid transactions, and
- * pick out the oldest.
- */
- while (continue_replay
- && cur_dblock <
- (SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- SB_ONDISK_JOURNAL_SIZE(sb))) {
- /*
-		 * Note that the blocksize of the primary fs device and the
-		 * journal device must be the same
- */
- d_bh =
- reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
- sb->s_blocksize,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- SB_ONDISK_JOURNAL_SIZE(sb));
- ret =
- journal_transaction_is_valid(sb, d_bh,
- &oldest_invalid_trans_id,
- &newest_mount_id);
- if (ret == 1) {
- desc = (struct reiserfs_journal_desc *)d_bh->b_data;
- if (oldest_start == 0) { /* init all oldest_ values */
- oldest_trans_id = get_desc_trans_id(desc);
- oldest_start = d_bh->b_blocknr;
- newest_mount_id = get_desc_mount_id(desc);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1179: Setting "
- "oldest_start to offset %llu, trans_id %lu",
- oldest_start -
- SB_ONDISK_JOURNAL_1st_BLOCK
- (sb), oldest_trans_id);
- } else if (oldest_trans_id > get_desc_trans_id(desc)) {
- /* one we just read was older */
- oldest_trans_id = get_desc_trans_id(desc);
- oldest_start = d_bh->b_blocknr;
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1180: Resetting "
- "oldest_start to offset %lu, trans_id %lu",
- oldest_start -
- SB_ONDISK_JOURNAL_1st_BLOCK
- (sb), oldest_trans_id);
- }
- if (newest_mount_id < get_desc_mount_id(desc)) {
- newest_mount_id = get_desc_mount_id(desc);
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1299: Setting "
- "newest_mount_id to %d",
- get_desc_mount_id(desc));
- }
- cur_dblock += get_desc_trans_len(desc) + 2;
- } else {
- cur_dblock++;
- }
- brelse(d_bh);
- }
-
-start_log_replay:
- cur_dblock = oldest_start;
- if (oldest_trans_id) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1206: Starting replay "
- "from offset %llu, trans_id %lu",
- cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- oldest_trans_id);
-
- }
- replay_count = 0;
- while (continue_replay && oldest_trans_id > 0) {
- ret =
- journal_read_transaction(sb, cur_dblock, oldest_start,
- oldest_trans_id, newest_mount_id);
- if (ret < 0) {
- return ret;
- } else if (ret != 0) {
- break;
- }
- cur_dblock =
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) + journal->j_start;
- replay_count++;
- if (cur_dblock == oldest_start)
- break;
- }
-
- if (oldest_trans_id == 0) {
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "journal-1225: No valid " "transactions found");
- }
- /*
- * j_start does not get set correctly if we don't replay any
- * transactions. if we had a valid journal_header, set j_start
- * to the first unflushed transaction value, copy the trans_id
- * from the header
- */
- if (valid_journal_header && replay_count == 0) {
- journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
- journal->j_trans_id =
- le32_to_cpu(jh->j_last_flush_trans_id) + 1;
- /* check for trans_id overflow */
- if (journal->j_trans_id == 0)
- journal->j_trans_id = 10;
- journal->j_last_flush_trans_id =
- le32_to_cpu(jh->j_last_flush_trans_id);
- journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
- } else {
- journal->j_mount_id = newest_mount_id + 1;
- }
- reiserfs_debug(sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
- "newest_mount_id to %lu", journal->j_mount_id);
- journal->j_first_unflushed_offset = journal->j_start;
- if (replay_count > 0) {
- reiserfs_info(sb,
- "replayed %d transactions in %lu seconds\n",
- replay_count, ktime_get_seconds() - start);
- }
- /* needed to satisfy the locking in _update_journal_header_block */
- reiserfs_write_lock(sb);
- if (!bdev_read_only(sb->s_bdev) &&
- _update_journal_header_block(sb, journal->j_start,
- journal->j_last_flush_trans_id)) {
- reiserfs_write_unlock(sb);
- /*
- * replay failed, caller must call free_journal_ram and abort
- * the mount
- */
- return -1;
- }
- reiserfs_write_unlock(sb);
- return 0;
-}
-
-static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
-{
- struct reiserfs_journal_list *jl;
- jl = kzalloc(sizeof(struct reiserfs_journal_list),
- GFP_NOFS | __GFP_NOFAIL);
- INIT_LIST_HEAD(&jl->j_list);
- INIT_LIST_HEAD(&jl->j_working_list);
- INIT_LIST_HEAD(&jl->j_tail_bh_list);
- INIT_LIST_HEAD(&jl->j_bh_list);
- mutex_init(&jl->j_commit_mutex);
- SB_JOURNAL(s)->j_num_lists++;
- get_journal_list(jl);
- return jl;
-}
-
-static void journal_list_init(struct super_block *sb)
-{
- SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
-}
-
-static void release_journal_dev(struct reiserfs_journal *journal)
-{
- if (journal->j_bdev_file) {
- bdev_fput(journal->j_bdev_file);
- journal->j_bdev_file = NULL;
- }
-}
-
-static int journal_init_dev(struct super_block *super,
- struct reiserfs_journal *journal,
- const char *jdev_name)
-{
- blk_mode_t blkdev_mode = BLK_OPEN_READ;
- void *holder = journal;
- int result;
- dev_t jdev;
-
- result = 0;
-
- journal->j_bdev_file = NULL;
- jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
- new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
-
- if (!bdev_read_only(super->s_bdev))
- blkdev_mode |= BLK_OPEN_WRITE;
-
-	/* no "jdev" option given; the journal device comes from the super block */
- if ((!jdev_name || !jdev_name[0])) {
- if (jdev == super->s_dev)
- holder = NULL;
- journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
- holder, NULL);
- if (IS_ERR(journal->j_bdev_file)) {
- result = PTR_ERR(journal->j_bdev_file);
- journal->j_bdev_file = NULL;
- reiserfs_warning(super, "sh-458",
- "cannot init journal device unknown-block(%u,%u): %i",
- MAJOR(jdev), MINOR(jdev), result);
- return result;
- } else if (jdev != super->s_dev)
- set_blocksize(journal->j_bdev_file, super->s_blocksize);
-
- return 0;
- }
-
- journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
- holder, NULL);
- if (IS_ERR(journal->j_bdev_file)) {
- result = PTR_ERR(journal->j_bdev_file);
- journal->j_bdev_file = NULL;
- reiserfs_warning(super, "sh-457",
- "journal_init_dev: Cannot open '%s': %i",
- jdev_name, result);
- return result;
- }
-
- set_blocksize(journal->j_bdev_file, super->s_blocksize);
- reiserfs_info(super,
- "journal_init_dev: journal device: %pg\n",
- file_bdev(journal->j_bdev_file));
- return 0;
-}
-
-/*
- * When creating/tuning a file system user can assign some
- * journal params within boundaries which depend on the ratio
- * blocksize/standard_blocksize.
- *
- * For blocks >= standard_blocksize the transaction size should
- * be no less than JOURNAL_TRANS_MIN_DEFAULT, and no more
- * than JOURNAL_TRANS_MAX_DEFAULT.
- *
- * For blocks < standard_blocksize these boundaries should be
- * decreased proportionally.
- */
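-/*
- * A worked example of the scaling above (a sketch; the numeric defaults
- * are whatever the JOURNAL_* macros expand to): with a 1024-byte
- * blocksize the ratio is 4096 / 1024 = 4, so j_trans_max must fall
- * within [JOURNAL_TRANS_MIN_DEFAULT / 4, JOURNAL_TRANS_MAX_DEFAULT / 4],
- * and j_max_batch must equal
- * j_trans_max * JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT,
- * exactly as check_advise_trans_params() below enforces.
- */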
-#define REISERFS_STANDARD_BLKSIZE (4096)
-
-static int check_advise_trans_params(struct super_block *sb,
- struct reiserfs_journal *journal)
-{
- if (journal->j_trans_max) {
- /* Non-default journal params. Do sanity check for them. */
- int ratio = 1;
- if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
- ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
-
- if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio ||
- journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio ||
- SB_ONDISK_JOURNAL_SIZE(sb) / journal->j_trans_max <
- JOURNAL_MIN_RATIO) {
- reiserfs_warning(sb, "sh-462",
- "bad transaction max size (%u). "
- "FSCK?", journal->j_trans_max);
- return 1;
- }
- if (journal->j_max_batch != (journal->j_trans_max) *
- JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT) {
- reiserfs_warning(sb, "sh-463",
- "bad transaction max batch (%u). "
- "FSCK?", journal->j_max_batch);
- return 1;
- }
- } else {
- /*
- * Default journal params.
-		 * The file system was created by an old version
- * of mkreiserfs, so some fields contain zeros,
- * and we need to advise proper values for them
- */
- if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
- reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
- sb->s_blocksize);
- return 1;
- }
- journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
- journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
- journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
- }
- return 0;
-}
-
-/* must be called once on fs mount. calls journal_read for you */
-int journal_init(struct super_block *sb, const char *j_dev_name,
- int old_format, unsigned int commit_max_age)
-{
- int num_cnodes = SB_ONDISK_JOURNAL_SIZE(sb) * 2;
- struct buffer_head *bhjh;
- struct reiserfs_super_block *rs;
- struct reiserfs_journal_header *jh;
- struct reiserfs_journal *journal;
- struct reiserfs_journal_list *jl;
- int ret;
-
- journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
- if (!journal) {
- reiserfs_warning(sb, "journal-1256",
- "unable to get memory for journal structure");
- return 1;
- }
- INIT_LIST_HEAD(&journal->j_bitmap_nodes);
- INIT_LIST_HEAD(&journal->j_prealloc_list);
- INIT_LIST_HEAD(&journal->j_working_list);
- INIT_LIST_HEAD(&journal->j_journal_list);
- journal->j_persistent_trans = 0;
- if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
- reiserfs_bmap_count(sb)))
- goto free_and_return;
-
- allocate_bitmap_nodes(sb);
-
- /* reserved for journal area support */
- SB_JOURNAL_1st_RESERVED_BLOCK(sb) = (old_format ?
- REISERFS_OLD_DISK_OFFSET_IN_BYTES
- / sb->s_blocksize +
- reiserfs_bmap_count(sb) +
- 1 :
- REISERFS_DISK_OFFSET_IN_BYTES /
- sb->s_blocksize + 2);
-
- /*
-	 * Sanity check to see if the standard journal fits
-	 * within the first bitmap (relevant for small blocksizes)
- */
- if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
- (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
- SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
- reiserfs_warning(sb, "journal-1393",
-				 "journal does not fit into the area addressed "
-				 "by the first bitmap block. It starts at "
- "%u and its size is %u. Block size %ld",
- SB_JOURNAL_1st_RESERVED_BLOCK(sb),
- SB_ONDISK_JOURNAL_SIZE(sb),
- sb->s_blocksize);
- goto free_and_return;
- }
-
- /*
- * Sanity check to see if journal first block is correct.
- * If journal first block is invalid it can cause
- * zeroing important superblock members.
- */
- if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
- reiserfs_warning(sb, "journal-1393",
- "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
- SB_JOURNAL_1st_RESERVED_BLOCK(sb),
- SB_ONDISK_JOURNAL_1st_BLOCK(sb));
- goto free_and_return;
- }
-
- if (journal_init_dev(sb, journal, j_dev_name) != 0) {
- reiserfs_warning(sb, "sh-462",
- "unable to initialize journal device");
- goto free_and_return;
- }
-
- rs = SB_DISK_SUPER_BLOCK(sb);
-
- /* read journal header */
- bhjh = journal_bread(sb,
- SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- SB_ONDISK_JOURNAL_SIZE(sb));
- if (!bhjh) {
- reiserfs_warning(sb, "sh-459",
- "unable to read journal header");
- goto free_and_return;
- }
- jh = (struct reiserfs_journal_header *)(bhjh->b_data);
-
- /* make sure that journal matches to the super block */
- if (is_reiserfs_jr(rs)
- && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
- sb_jp_journal_magic(rs))) {
- reiserfs_warning(sb, "sh-460",
- "journal header magic %x (device %pg) does "
- "not match to magic found in super block %x",
- jh->jh_journal.jp_journal_magic,
- file_bdev(journal->j_bdev_file),
- sb_jp_journal_magic(rs));
- brelse(bhjh);
- goto free_and_return;
- }
-
- journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
- journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
- journal->j_max_commit_age =
- le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
- journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-
- if (check_advise_trans_params(sb, journal) != 0)
- goto free_and_return;
- journal->j_default_max_commit_age = journal->j_max_commit_age;
-
- if (commit_max_age != 0) {
- journal->j_max_commit_age = commit_max_age;
- journal->j_max_trans_age = commit_max_age;
- }
-
- reiserfs_info(sb, "journal params: device %pg, size %u, "
- "journal first block %u, max trans len %u, max batch %u, "
- "max commit age %u, max trans age %u\n",
- file_bdev(journal->j_bdev_file),
- SB_ONDISK_JOURNAL_SIZE(sb),
- SB_ONDISK_JOURNAL_1st_BLOCK(sb),
- journal->j_trans_max,
- journal->j_max_batch,
- journal->j_max_commit_age, journal->j_max_trans_age);
-
- brelse(bhjh);
-
- journal->j_list_bitmap_index = 0;
- journal_list_init(sb);
-
- memset(journal->j_list_hash_table, 0,
- JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
-
- INIT_LIST_HEAD(&journal->j_dirty_buffers);
- spin_lock_init(&journal->j_dirty_buffers_lock);
-
- journal->j_start = 0;
- journal->j_len = 0;
- journal->j_len_alloc = 0;
- atomic_set(&journal->j_wcount, 0);
- atomic_set(&journal->j_async_throttle, 0);
- journal->j_bcount = 0;
- journal->j_trans_start_time = 0;
- journal->j_last = NULL;
- journal->j_first = NULL;
- init_waitqueue_head(&journal->j_join_wait);
- mutex_init(&journal->j_mutex);
- mutex_init(&journal->j_flush_mutex);
-
- journal->j_trans_id = 10;
- journal->j_mount_id = 10;
- journal->j_state = 0;
- atomic_set(&journal->j_jlock, 0);
- journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
- journal->j_cnode_free_orig = journal->j_cnode_free_list;
- journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
- journal->j_cnode_used = 0;
- journal->j_must_wait = 0;
-
- if (journal->j_cnode_free == 0) {
- reiserfs_warning(sb, "journal-2004", "Journal cnode memory "
- "allocation failed (%ld bytes). Journal is "
-				 "too large for available memory.",
- sizeof (struct reiserfs_journal_cnode) * num_cnodes);
- goto free_and_return;
- }
-
- init_journal_hash(sb);
- jl = journal->j_current_jl;
-
- /*
- * get_list_bitmap() may call flush_commit_list() which
- * requires the lock. Calling flush_commit_list() shouldn't happen
- * this early but I like to be paranoid.
- */
- reiserfs_write_lock(sb);
- jl->j_list_bitmap = get_list_bitmap(sb, jl);
- reiserfs_write_unlock(sb);
- if (!jl->j_list_bitmap) {
- reiserfs_warning(sb, "journal-2005",
- "get_list_bitmap failed for journal list 0");
- goto free_and_return;
- }
-
- ret = journal_read(sb);
- if (ret < 0) {
- reiserfs_warning(sb, "reiserfs-2006",
- "Replay Failure, unable to mount");
- goto free_and_return;
- }
-
- INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
- journal->j_work_sb = sb;
- return 0;
-free_and_return:
- free_journal_ram(sb);
- return 1;
-}
-
-/*
- * test for a polite end of the current transaction. Used by file_write,
- * and should be used by delete to make sure they don't write more than
- * can fit inside a single transaction
- */
-int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
- int new_alloc)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
- time64_t now = ktime_get_seconds();
- /* cannot restart while nested */
- BUG_ON(!th->t_trans_id);
- if (th->t_refcount > 1)
- return 0;
- if (journal->j_must_wait > 0 ||
- (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
- atomic_read(&journal->j_jlock) ||
- (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
- journal->j_cnode_free < (journal->j_trans_max * 3)) {
- return 1;
- }
-
- journal->j_len_alloc += new_alloc;
- th->t_blocks_allocated += new_alloc ;
- return 0;
-}
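-
-/*
- * Illustrative caller pattern (a sketch, not a verbatim caller; the
- * blocks_for_next_chunk estimate is hypothetical): a long-running writer
- * polls this between chunks and restarts the transaction when asked, so
- * it never overflows j_max_batch.
- *
- *	if (journal_transaction_should_end(th, blocks_for_next_chunk)) {
- *		retval = journal_end(th);
- *		if (!retval)
- *			retval = journal_begin(th, sb,
- *					       blocks_for_next_chunk);
- *	}
- */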
-
-/* this must be called inside a transaction */
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
- BUG_ON(!th->t_trans_id);
- journal->j_must_wait = 1;
- set_bit(J_WRITERS_BLOCKED, &journal->j_state);
- return;
-}
-
-/* this must be called without a transaction started */
-void reiserfs_allow_writes(struct super_block *s)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- clear_bit(J_WRITERS_BLOCKED, &journal->j_state);
- wake_up(&journal->j_join_wait);
-}
-
-/* this must be called without a transaction started */
-void reiserfs_wait_on_write_block(struct super_block *s)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- wait_event(journal->j_join_wait,
- !test_bit(J_WRITERS_BLOCKED, &journal->j_state));
-}
-
-static void queue_log_writer(struct super_block *s)
-{
- wait_queue_entry_t wait;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- set_bit(J_WRITERS_QUEUED, &journal->j_state);
-
- /*
- * we don't want to use wait_event here because
- * we only want to wait once.
- */
- init_waitqueue_entry(&wait, current);
- add_wait_queue(&journal->j_join_wait, &wait);
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
- int depth = reiserfs_write_unlock_nested(s);
- schedule();
- reiserfs_write_lock_nested(s, depth);
- }
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&journal->j_join_wait, &wait);
-}
-
-static void wake_queued_writers(struct super_block *s)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
- wake_up(&journal->j_join_wait);
-}
-
-static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- unsigned long bcount = journal->j_bcount;
- while (1) {
- int depth;
-
- depth = reiserfs_write_unlock_nested(sb);
- schedule_timeout_uninterruptible(1);
- reiserfs_write_lock_nested(sb, depth);
-
- journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
- while ((atomic_read(&journal->j_wcount) > 0 ||
- atomic_read(&journal->j_jlock)) &&
- journal->j_trans_id == trans_id) {
- queue_log_writer(sb);
- }
- if (journal->j_trans_id != trans_id)
- break;
- if (bcount == journal->j_bcount)
- break;
- bcount = journal->j_bcount;
- }
-}
-
-/*
- * join == true if you must join an existing transaction.
- * join == false if you can deal with waiting for others to finish
- *
- * this will block until the transaction is joinable. send the number of
- * blocks you expect to use in nblocks.
- */
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks,
- int join)
-{
- time64_t now = ktime_get_seconds();
- unsigned int old_trans_id;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_transaction_handle myth;
- int retval;
- int depth;
-
- reiserfs_check_lock_depth(sb, "journal_begin");
- BUG_ON(nblocks > journal->j_trans_max);
-
- PROC_INFO_INC(sb, journal.journal_being);
- /* set here for journal_join */
- th->t_refcount = 1;
- th->t_super = sb;
-
-relock:
- lock_journal(sb);
- if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
- unlock_journal(sb);
- retval = journal->j_errno;
- goto out_fail;
- }
- journal->j_bcount++;
-
- if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
- unlock_journal(sb);
- depth = reiserfs_write_unlock_nested(sb);
- reiserfs_wait_on_write_block(sb);
- reiserfs_write_lock_nested(sb, depth);
- PROC_INFO_INC(sb, journal.journal_relock_writers);
- goto relock;
- }
- now = ktime_get_seconds();
-
- /*
- * if there is no room in the journal OR
- * if this transaction is too old, and we weren't called joinable,
-	 * if this transaction is too old, and we weren't called joinable,
-	 * wait for it to finish before beginning. we don't sleep if there
- */
-
- if ((!join && journal->j_must_wait > 0) ||
- (!join
- && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
- || (!join && atomic_read(&journal->j_wcount) > 0
- && journal->j_trans_start_time > 0
- && (now - journal->j_trans_start_time) >
- journal->j_max_trans_age) || (!join
- && atomic_read(&journal->j_jlock))
- || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
-
- old_trans_id = journal->j_trans_id;
- /* allow others to finish this transaction */
- unlock_journal(sb);
-
- if (!join && (journal->j_len_alloc + nblocks + 2) >=
- journal->j_max_batch &&
- ((journal->j_len + nblocks + 2) * 100) <
- (journal->j_len_alloc * 75)) {
- if (atomic_read(&journal->j_wcount) > 10) {
- queue_log_writer(sb);
- goto relock;
- }
- }
- /*
- * don't mess with joining the transaction if all we
- * have to do is wait for someone else to do a commit
- */
- if (atomic_read(&journal->j_jlock)) {
- while (journal->j_trans_id == old_trans_id &&
- atomic_read(&journal->j_jlock)) {
- queue_log_writer(sb);
- }
- goto relock;
- }
- retval = journal_join(&myth, sb);
- if (retval)
- goto out_fail;
-
- /* someone might have ended the transaction while we joined */
- if (old_trans_id != journal->j_trans_id) {
- retval = do_journal_end(&myth, 0);
- } else {
- retval = do_journal_end(&myth, COMMIT_NOW);
- }
-
- if (retval)
- goto out_fail;
-
- PROC_INFO_INC(sb, journal.journal_relock_wcount);
- goto relock;
- }
- /* we are the first writer, set trans_id */
- if (journal->j_trans_start_time == 0) {
- journal->j_trans_start_time = ktime_get_seconds();
- }
- atomic_inc(&journal->j_wcount);
- journal->j_len_alloc += nblocks;
- th->t_blocks_logged = 0;
- th->t_blocks_allocated = nblocks;
- th->t_trans_id = journal->j_trans_id;
- unlock_journal(sb);
- INIT_LIST_HEAD(&th->t_list);
- return 0;
-
-out_fail:
- memset(th, 0, sizeof(*th));
- /*
- * Re-set th->t_super, so we can properly keep track of how many
- * persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later
- */
- th->t_super = sb;
- return retval;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
- super_block
- *s,
- int nblocks)
-{
- int ret;
- struct reiserfs_transaction_handle *th;
-
- /*
-	 * if we're nesting into an existing transaction, it will be
-	 * persistent on its own
- */
- if (reiserfs_transaction_running(s)) {
- th = current->journal_info;
- th->t_refcount++;
- BUG_ON(th->t_refcount < 2);
-
- return th;
- }
- th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
- if (!th)
- return NULL;
- ret = journal_begin(th, s, nblocks);
- if (ret) {
- kfree(th);
- return NULL;
- }
-
- SB_JOURNAL(s)->j_persistent_trans++;
- return th;
-}
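-
-/*
- * Sketch of the intended pairing (illustrative only):
- *
- *	struct reiserfs_transaction_handle *th;
- *
- *	th = reiserfs_persistent_transaction(sb, 1);
- *	if (!th)
- *		return -ENOMEM;
- *	... journaled work on sb goes here ...
- *	ret = reiserfs_end_persistent_transaction(th);
- *
- * the handle is only freed once its refcount drops to zero, so a nested
- * caller just bumps and releases the same handle.
- */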
-
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
-{
- struct super_block *s = th->t_super;
- int ret = 0;
- if (th->t_trans_id)
- ret = journal_end(th);
- else
- ret = -EIO;
- if (th->t_refcount == 0) {
- SB_JOURNAL(s)->j_persistent_trans--;
- kfree(th);
- }
- return ret;
-}
-
-static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb)
-{
- struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
- /*
- * this keeps do_journal_end from NULLing out the
- * current->journal_info pointer
- */
- th->t_handle_save = cur_th;
- BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
-}
-
-int journal_join_abort(struct reiserfs_transaction_handle *th,
- struct super_block *sb)
-{
- struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
- /*
- * this keeps do_journal_end from NULLing out the
- * current->journal_info pointer
- */
- th->t_handle_save = cur_th;
- BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
-}
-
-int journal_begin(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
-{
- struct reiserfs_transaction_handle *cur_th = current->journal_info;
- int ret;
-
- th->t_handle_save = NULL;
- if (cur_th) {
- /* we are nesting into the current transaction */
- if (cur_th->t_super == sb) {
- BUG_ON(!cur_th->t_refcount);
- cur_th->t_refcount++;
- memcpy(th, cur_th, sizeof(*th));
- if (th->t_refcount <= 1)
- reiserfs_warning(sb, "reiserfs-2005",
- "BAD: refcount <= 1, but "
- "journal_info != 0");
- return 0;
- } else {
- /*
- * we've ended up with a handle from a different
- * filesystem. save it and restore on journal_end.
- * This should never really happen...
- */
- reiserfs_warning(sb, "clm-2100",
-					 "nesting into a different FS");
- th->t_handle_save = current->journal_info;
- current->journal_info = th;
- }
- } else {
- current->journal_info = th;
- }
- ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
- BUG_ON(current->journal_info != th);
-
- /*
- * I guess this boils down to being the reciprocal of clm-2100 above.
- * If do_journal_begin_r fails, we need to put it back, since
-	 * journal_end won't be called to do it.
-	 */
- if (ret)
- current->journal_info = th->t_handle_save;
- else
- BUG_ON(!th->t_refcount);
-
- return ret;
-}
-
-/*
- * puts bh into the current transaction. If it was already there, reorders:
- * removes the old pointers from the hash, and puts new ones in (to make
- * sure replay happens in the right order).
- *
- * if it was dirty, cleans and files onto the clean list. I can't let it
- * be dirty again until the transaction is committed.
- *
- * if j_len is bigger than j_len_alloc, it pushes j_len_alloc to
- * j_len + JOURNAL_PER_BALANCE_CNT.
- */
-int journal_mark_dirty(struct reiserfs_transaction_handle *th,
- struct buffer_head *bh)
-{
- struct super_block *sb = th->t_super;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_cnode *cn = NULL;
- int count_already_incd = 0;
- int prepared = 0;
- BUG_ON(!th->t_trans_id);
-
- PROC_INFO_INC(sb, journal.mark_dirty);
- if (th->t_trans_id != journal->j_trans_id) {
- reiserfs_panic(th->t_super, "journal-1577",
- "handle trans id %ld != current trans id %ld",
- th->t_trans_id, journal->j_trans_id);
- }
-
- prepared = test_clear_buffer_journal_prepared(bh);
- clear_buffer_journal_restore_dirty(bh);
- /* already in this transaction, we are done */
- if (buffer_journaled(bh)) {
- PROC_INFO_INC(sb, journal.mark_dirty_already);
- return 0;
- }
-
- /*
- * this must be turned into a panic instead of a warning. We can't
- * allow a dirty or journal_dirty or locked buffer to be logged, as
- * some changes could get to disk too early. NOT GOOD.
- */
- if (!prepared || buffer_dirty(bh)) {
- reiserfs_warning(sb, "journal-1777",
- "buffer %llu bad state "
- "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
- (unsigned long long)bh->b_blocknr,
- prepared ? ' ' : '!',
- buffer_locked(bh) ? ' ' : '!',
- buffer_dirty(bh) ? ' ' : '!',
- buffer_journal_dirty(bh) ? ' ' : '!');
- }
-
- if (atomic_read(&journal->j_wcount) <= 0) {
- reiserfs_warning(sb, "journal-1409",
- "returning because j_wcount was %d",
- atomic_read(&journal->j_wcount));
- return 1;
- }
- /*
- * this error means I've screwed up, and we've overflowed
- * the transaction. Nothing can be done here, except make the
- * FS readonly or panic.
- */
- if (journal->j_len >= journal->j_trans_max) {
- reiserfs_panic(th->t_super, "journal-1413",
- "j_len (%lu) is too big",
- journal->j_len);
- }
-
- if (buffer_journal_dirty(bh)) {
- count_already_incd = 1;
- PROC_INFO_INC(sb, journal.mark_dirty_notjournal);
- clear_buffer_journal_dirty(bh);
- }
-
- if (journal->j_len > journal->j_len_alloc) {
- journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
- }
-
- set_buffer_journaled(bh);
-
- /* now put this guy on the end */
- if (!cn) {
- cn = get_cnode(sb);
- if (!cn) {
- reiserfs_panic(sb, "journal-4", "get_cnode failed!");
- }
-
- if (th->t_blocks_logged == th->t_blocks_allocated) {
- th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
- journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
- }
- th->t_blocks_logged++;
- journal->j_len++;
-
- cn->bh = bh;
- cn->blocknr = bh->b_blocknr;
- cn->sb = sb;
- cn->jlist = NULL;
- insert_journal_hash(journal->j_hash_table, cn);
- if (!count_already_incd) {
- get_bh(bh);
- }
- }
- cn->next = NULL;
- cn->prev = journal->j_last;
- cn->bh = bh;
- if (journal->j_last) {
- journal->j_last->next = cn;
- journal->j_last = cn;
- } else {
- journal->j_first = cn;
- journal->j_last = cn;
- }
- reiserfs_schedule_old_flush(sb);
- return 0;
-}
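-
-/*
- * The usual call pattern, as used with the superblock buffer elsewhere
- * in this file: prepare the buffer first, then log it under the handle.
- *
- *	reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1);
- *	journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
- */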
-
-int journal_end(struct reiserfs_transaction_handle *th)
-{
- struct super_block *sb = th->t_super;
- if (!current->journal_info && th->t_refcount > 1)
- reiserfs_warning(sb, "REISER-NESTING",
- "th NULL, refcount %d", th->t_refcount);
-
- if (!th->t_trans_id) {
- WARN_ON(1);
- return -EIO;
- }
-
- th->t_refcount--;
- if (th->t_refcount > 0) {
- struct reiserfs_transaction_handle *cur_th =
- current->journal_info;
-
- /*
- * we aren't allowed to close a nested transaction on a
- * different filesystem from the one in the task struct
- */
- BUG_ON(cur_th->t_super != th->t_super);
-
- if (th != cur_th) {
- memcpy(current->journal_info, th, sizeof(*th));
- th->t_trans_id = 0;
- }
- return 0;
- } else {
- return do_journal_end(th, 0);
- }
-}
-
-/*
- * removes from the current transaction, releasing and decrementing any
- * counters. also files the removed buffer directly onto the clean list
- *
- * called by journal_mark_freed when a block has been deleted
- *
- * returns 1 if it cleaned and released the buffer. 0 otherwise
- */
-static int remove_from_transaction(struct super_block *sb,
- b_blocknr_t blocknr, int already_cleaned)
-{
- struct buffer_head *bh;
- struct reiserfs_journal_cnode *cn;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- int ret = 0;
-
- cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
- if (!cn || !cn->bh) {
- return ret;
- }
- bh = cn->bh;
- if (cn->prev) {
- cn->prev->next = cn->next;
- }
- if (cn->next) {
- cn->next->prev = cn->prev;
- }
- if (cn == journal->j_first) {
- journal->j_first = cn->next;
- }
- if (cn == journal->j_last) {
- journal->j_last = cn->prev;
- }
- remove_journal_hash(sb, journal->j_hash_table, NULL,
- bh->b_blocknr, 0);
- clear_buffer_journaled(bh); /* don't log this one */
-
- if (!already_cleaned) {
- clear_buffer_journal_dirty(bh);
- clear_buffer_dirty(bh);
- clear_buffer_journal_test(bh);
- put_bh(bh);
- if (atomic_read(&bh->b_count) < 0) {
- reiserfs_warning(sb, "journal-1752",
- "b_count < 0");
- }
- ret = 1;
- }
- journal->j_len--;
- journal->j_len_alloc--;
- free_cnode(sb, cn);
- return ret;
-}
-
-/*
- * for any cnode in a journal list, it can only be dirtied if all the
- * transactions that include it are committed to disk.
- * this checks through each transaction, and returns 1 if you are allowed
- * to dirty, and 0 if you aren't
- *
- * it is called by dirty_journal_list, which is called after
- * flush_commit_list has gotten all the log blocks for a given
- * transaction on disk
- *
- */
-static int can_dirty(struct reiserfs_journal_cnode *cn)
-{
- struct super_block *sb = cn->sb;
- b_blocknr_t blocknr = cn->blocknr;
- struct reiserfs_journal_cnode *cur = cn->hprev;
- int can_dirty = 1;
-
- /*
- * first test hprev. These are all newer than cn, so any node here
- * with the same block number and dev means this node can't be sent
- * to disk right now.
- */
- while (cur && can_dirty) {
- if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
- cur->blocknr == blocknr) {
- can_dirty = 0;
- }
- cur = cur->hprev;
- }
- /*
- * then test hnext. These are all older than cn. As long as they
- * are committed to the log, it is safe to write cn to disk
- */
- cur = cn->hnext;
- while (cur && can_dirty) {
- if (cur->jlist && cur->jlist->j_len > 0 &&
- atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
- cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
- can_dirty = 0;
- }
- cur = cur->hnext;
- }
- return can_dirty;
-}
-
-/*
- * syncs the commit blocks, but does not force the real buffers to disk
- * will wait until the current transaction is done/committed before returning
- */
-int journal_end_sync(struct reiserfs_transaction_handle *th)
-{
- struct super_block *sb = th->t_super;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- BUG_ON(!th->t_trans_id);
-	/* syncing while nested would be very, very bad */
- BUG_ON(th->t_refcount > 1);
- if (journal->j_len == 0) {
- reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
- }
- return do_journal_end(th, COMMIT_NOW | WAIT);
-}
-
-/* writeback the pending async commits to disk */
-static void flush_async_commits(struct work_struct *work)
-{
- struct reiserfs_journal *journal =
- container_of(work, struct reiserfs_journal, j_work.work);
- struct super_block *sb = journal->j_work_sb;
- struct reiserfs_journal_list *jl;
- struct list_head *entry;
-
- reiserfs_write_lock(sb);
- if (!list_empty(&journal->j_journal_list)) {
- /* last entry is the youngest, commit it and you get everything */
- entry = journal->j_journal_list.prev;
- jl = JOURNAL_LIST_ENTRY(entry);
- flush_commit_list(sb, jl, 1);
- }
- reiserfs_write_unlock(sb);
-}
-
-/*
- * flushes any old transactions to disk
- * ends the current transaction if it is too old
- */
-void reiserfs_flush_old_commits(struct super_block *sb)
-{
- time64_t now;
- struct reiserfs_transaction_handle th;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- now = ktime_get_seconds();
- /*
- * safety check so we don't flush while we are replaying the log during
- * mount
- */
- if (list_empty(&journal->j_journal_list))
- return;
-
- /*
- * check the current transaction. If there are no writers, and it is
- * too old, finish it, and force the commit blocks to disk
- */
- if (atomic_read(&journal->j_wcount) <= 0 &&
- journal->j_trans_start_time > 0 &&
- journal->j_len > 0 &&
- (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
- if (!journal_join(&th, sb)) {
- reiserfs_prepare_for_journal(sb,
- SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
-
- /*
- * we're only being called from kreiserfsd, it makes
- * no sense to do an async commit so that kreiserfsd
- * can do it later
- */
- do_journal_end(&th, COMMIT_NOW | WAIT);
- }
- }
-}
-
-/*
- * returns 0 if do_journal_end should return right away, returns 1 if
- * do_journal_end should finish the commit
- *
- * if the current transaction is too old, but still has writers, this will
- * wait on j_join_wait until all the writers are done. By the time it
- * wakes up, the transaction it was called on has already ended, so it just
- * flushes the commit list and returns 0.
- *
- * Won't batch when flush or commit_now is set. Also won't batch when
- * others are waiting on j_join_wait.
- *
- * Note, we can't allow the journal_end to proceed while there are still
- * writers in the log.
- */
-static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
-
- time64_t now;
- int flush = flags & FLUSH_ALL;
- int commit_now = flags & COMMIT_NOW;
- int wait_on_commit = flags & WAIT;
- struct reiserfs_journal_list *jl;
- struct super_block *sb = th->t_super;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
-
- BUG_ON(!th->t_trans_id);
-
- if (th->t_trans_id != journal->j_trans_id) {
- reiserfs_panic(th->t_super, "journal-1577",
- "handle trans id %ld != current trans id %ld",
- th->t_trans_id, journal->j_trans_id);
- }
-
- journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
- /* <= 0 is allowed. unmounting might not call begin */
- if (atomic_read(&journal->j_wcount) > 0)
- atomic_dec(&journal->j_wcount);
-
- /*
-	 * BUG: deal with the case where j_len is 0, but blocks previously
-	 * freed still need to be released. They will be dealt with by the
-	 * next transaction that actually writes something, but should be
-	 * taken care of in this trans
- */
- BUG_ON(journal->j_len == 0);
-
- /*
-	 * if wcount > 0, and we are called with flush or commit_now,
- * we wait on j_join_wait. We will wake up when the last writer has
- * finished the transaction, and started it on its way to the disk.
- * Then, we flush the commit or journal list, and just return 0
- * because the rest of journal end was already done for this
- * transaction.
- */
- if (atomic_read(&journal->j_wcount) > 0) {
- if (flush || commit_now) {
- unsigned trans_id;
-
- jl = journal->j_current_jl;
- trans_id = jl->j_trans_id;
- if (wait_on_commit)
- jl->j_state |= LIST_COMMIT_PENDING;
- atomic_set(&journal->j_jlock, 1);
- if (flush) {
- journal->j_next_full_flush = 1;
- }
- unlock_journal(sb);
-
- /*
- * sleep while the current transaction is
- * still j_jlocked
- */
- while (journal->j_trans_id == trans_id) {
- if (atomic_read(&journal->j_jlock)) {
- queue_log_writer(sb);
- } else {
- lock_journal(sb);
- if (journal->j_trans_id == trans_id) {
- atomic_set(&journal->j_jlock,
- 1);
- }
- unlock_journal(sb);
- }
- }
- BUG_ON(journal->j_trans_id == trans_id);
-
- if (commit_now
- && journal_list_still_alive(sb, trans_id)
- && wait_on_commit) {
- flush_commit_list(sb, jl, 1);
- }
- return 0;
- }
- unlock_journal(sb);
- return 0;
- }
-
- /* deal with old transactions where we are the last writers */
- now = ktime_get_seconds();
- if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
- commit_now = 1;
- journal->j_next_async_flush = 1;
- }
- /* don't batch when someone is waiting on j_join_wait */
- /* don't batch when syncing the commit or flushing the whole trans */
- if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
- && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
- && journal->j_len_alloc < journal->j_max_batch
- && journal->j_cnode_free > (journal->j_trans_max * 3)) {
- journal->j_bcount++;
- unlock_journal(sb);
- return 0;
- }
-
- if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(sb)) {
- reiserfs_panic(sb, "journal-003",
- "j_start (%ld) is too high",
- journal->j_start);
- }
- return 1;
-}
-
-/*
- * Does all the work that makes deleting blocks safe.
- * when deleting a block marked BH_JNew, just remove it from the current
- * transaction, clean its buffer_head and move on.
- *
- * otherwise:
- * set a bit for the block in the journal bitmap. That will prevent it from
- * being allocated for unformatted nodes before this transaction has finished.
- *
- * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
- * That will prevent any old transactions with this block from trying to flush
- * to the real location. Since we aren't removing the cnode from the
- * journal_list_hash, the block can't be reallocated yet.
- *
- * Then remove it from the current transaction, decrementing any counters and
- * filing it on the clean list.
- */
-int journal_mark_freed(struct reiserfs_transaction_handle *th,
- struct super_block *sb, b_blocknr_t blocknr)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_cnode *cn = NULL;
- struct buffer_head *bh = NULL;
- struct reiserfs_list_bitmap *jb = NULL;
- int cleaned = 0;
- BUG_ON(!th->t_trans_id);
-
- cn = get_journal_hash_dev(sb, journal->j_hash_table, blocknr);
- if (cn && cn->bh) {
- bh = cn->bh;
- get_bh(bh);
- }
- /* if it is journal new, we just remove it from this transaction */
- if (bh && buffer_journal_new(bh)) {
- clear_buffer_journal_new(bh);
- clear_prepared_bits(bh);
- reiserfs_clean_and_file_buffer(bh);
- cleaned = remove_from_transaction(sb, blocknr, cleaned);
- } else {
- /*
- * set the bit for this block in the journal bitmap
- * for this transaction
- */
- jb = journal->j_current_jl->j_list_bitmap;
- if (!jb) {
- reiserfs_panic(sb, "journal-1702",
- "journal_list_bitmap is NULL");
- }
- set_bit_in_list_bitmap(sb, blocknr, jb);
-
- /* Note, the entire while loop is not allowed to schedule. */
-
- if (bh) {
- clear_prepared_bits(bh);
- reiserfs_clean_and_file_buffer(bh);
- }
- cleaned = remove_from_transaction(sb, blocknr, cleaned);
-
- /*
- * find all older transactions with this block,
- * make sure they don't try to write it out
- */
- cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
- blocknr);
- while (cn) {
- if (sb == cn->sb && blocknr == cn->blocknr) {
- set_bit(BLOCK_FREED, &cn->state);
- if (cn->bh) {
- /*
- * remove_from_transaction will brelse
- * the buffer if it was in the current
- * trans
- */
- if (!cleaned) {
- clear_buffer_journal_dirty(cn->bh);
- clear_buffer_dirty(cn->bh);
- clear_buffer_journal_test(cn->bh);
- cleaned = 1;
- put_bh(cn->bh);
- if (atomic_read(&cn->bh->b_count) < 0) {
- reiserfs_warning(sb, "journal-2138",
- "cn->bh->b_count < 0");
- }
- }
- /*
- * since we are clearing the bh,
- * we MUST dec nonzerolen
- */
- if (cn->jlist) {
- atomic_dec(&cn->jlist->j_nonzerolen);
- }
- cn->bh = NULL;
- }
- }
- cn = cn->hnext;
- }
- }
-
- if (bh)
- release_buffer_page(bh); /* get_hash grabs the buffer */
- return 0;
-}
-
-void reiserfs_update_inode_transaction(struct inode *inode)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
- REISERFS_I(inode)->i_jl = journal->j_current_jl;
- REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
-}
-
-/*
- * returns -1 on error, 0 if no commits/barriers were done and 1
- * if a transaction was actually committed and the barrier was done
- */
-static int __commit_trans_jl(struct inode *inode, unsigned long id,
- struct reiserfs_journal_list *jl)
-{
- struct reiserfs_transaction_handle th;
- struct super_block *sb = inode->i_sb;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- int ret = 0;
-
- /*
- * is it from the current transaction,
- * or from an unknown transaction?
- */
- if (id == journal->j_trans_id) {
- jl = journal->j_current_jl;
- /*
- * try to let other writers come in and
- * grow this transaction
- */
- let_transaction_grow(sb, id);
- if (journal->j_trans_id != id) {
- goto flush_commit_only;
- }
-
- ret = journal_begin(&th, sb, 1);
- if (ret)
- return ret;
-
- /* someone might have ended this transaction while we joined */
- if (journal->j_trans_id != id) {
- reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
- ret = journal_end(&th);
- goto flush_commit_only;
- }
-
- ret = journal_end_sync(&th);
- if (!ret)
- ret = 1;
-
- } else {
- /*
- * this gets tricky, we have to make sure the journal list in
- * the inode still exists. We know the list is still around
- * if we've got a larger transaction id than the oldest list
- */
-flush_commit_only:
- if (journal_list_still_alive(inode->i_sb, id)) {
- /*
- * we only set ret to 1 when we know for sure
- * the barrier hasn't been started yet on the commit
- * block.
- */
- if (atomic_read(&jl->j_commit_left) > 1)
- ret = 1;
- flush_commit_list(sb, jl, 1);
- if (journal->j_errno)
- ret = journal->j_errno;
- }
- }
- /* otherwise the list is gone, and long since committed */
- return ret;
-}
-
-int reiserfs_commit_for_inode(struct inode *inode)
-{
- unsigned int id = REISERFS_I(inode)->i_trans_id;
- struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
-
- /*
- * for the whole inode, an unset id is assumed to mean it was
- * changed in the current transaction. That is more conservative.
- */
- if (!id || !jl) {
- reiserfs_update_inode_transaction(inode);
- id = REISERFS_I(inode)->i_trans_id;
- /* jl will be updated in __commit_trans_jl */
- }
-
- return __commit_trans_jl(inode, id, jl);
-}
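-
-/*
- * Sketch of how a caller might consume the return contract documented
- * above __commit_trans_jl() (negative on error, 0 when no commit or
- * barrier was needed, 1 when a commit really happened). The helper name
- * is hypothetical.
- */
-#if 0 /* illustrative sketch only */
-static int example_sync_inode_journal(struct inode *inode)
-{
- int ret = reiserfs_commit_for_inode(inode);
-
- if (ret < 0)
- return ret; /* the commit failed */
- /* ret == 1: commit + barrier done; ret == 0: nothing to commit */
- return 0;
-}
-#endif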
-
-void reiserfs_restore_prepared_buffer(struct super_block *sb,
- struct buffer_head *bh)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- PROC_INFO_INC(sb, journal.restore_prepared);
- if (!bh) {
- return;
- }
- if (test_clear_buffer_journal_restore_dirty(bh) &&
- buffer_journal_dirty(bh)) {
- struct reiserfs_journal_cnode *cn;
- reiserfs_write_lock(sb);
- cn = get_journal_hash_dev(sb,
- journal->j_list_hash_table,
- bh->b_blocknr);
- if (cn && can_dirty(cn)) {
- set_buffer_journal_test(bh);
- mark_buffer_dirty(bh);
- }
- reiserfs_write_unlock(sb);
- }
- clear_buffer_journal_prepared(bh);
-}
-
-extern struct tree_balance *cur_tb;
-/*
- * before we can change a metadata block, we have to make sure it won't
- * be written to disk while we are altering it. So, we must:
- * clean it
- * wait on it.
- */
-int reiserfs_prepare_for_journal(struct super_block *sb,
- struct buffer_head *bh, int wait)
-{
- PROC_INFO_INC(sb, journal.prepare);
-
- if (!trylock_buffer(bh)) {
- if (!wait)
- return 0;
- lock_buffer(bh);
- }
- set_buffer_journal_prepared(bh);
- if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
- clear_buffer_journal_test(bh);
- set_buffer_journal_restore_dirty(bh);
- }
- unlock_buffer(bh);
- return 1;
-}
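-
-/*
- * The usual pattern around the helper above, as a sketch mirroring the
- * SB_BUFFER_WITH_SB() call sites elsewhere in this file: clean and lock
- * the buffer down, modify it, then log it under the open handle.
- */
-#if 0 /* illustrative sketch only */
-static void example_log_buffer(struct reiserfs_transaction_handle *th,
- struct super_block *sb,
- struct buffer_head *bh)
-{
- reiserfs_prepare_for_journal(sb, bh, 1); /* 1 == wait for the lock */
- /* ... modify bh->b_data here ... */
- journal_mark_dirty(th, bh); /* file bh in the running transaction */
-}
-#endif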
-
-/*
- * long and ugly. If flush, will not return until all commit
- * blocks and all real buffers in the trans are on disk.
- * If no_async, won't return until all commit blocks are on disk.
- *
- * keep reading, there are comments as you go along
- *
- * If the journal is aborted, we just clean up. Things like flushing
- * journal lists, etc just won't happen.
- */
-static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
-{
- struct super_block *sb = th->t_super;
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- struct reiserfs_journal_cnode *cn, *next, *jl_cn;
- struct reiserfs_journal_cnode *last_cn = NULL;
- struct reiserfs_journal_desc *desc;
- struct reiserfs_journal_commit *commit;
- struct buffer_head *c_bh; /* commit bh */
- struct buffer_head *d_bh; /* desc bh */
- int cur_write_start = 0; /* start index of current log write */
- int i;
- int flush;
- int wait_on_commit;
- struct reiserfs_journal_list *jl, *temp_jl;
- struct list_head *entry, *safe;
- unsigned long jindex;
- unsigned int commit_trans_id;
- int trans_half;
- int depth;
-
- BUG_ON(th->t_refcount > 1);
- BUG_ON(!th->t_trans_id);
- BUG_ON(!th->t_super);
-
- /*
- * protect flush_older_commits from doing mistakes if the
- * transaction ID counter gets overflowed.
- */
- if (th->t_trans_id == ~0U)
- flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
- flush = flags & FLUSH_ALL;
- wait_on_commit = flags & WAIT;
-
- current->journal_info = th->t_handle_save;
- reiserfs_check_lock_depth(sb, "journal end");
- if (journal->j_len == 0) {
- reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
- 1);
- journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
- }
-
- lock_journal(sb);
- if (journal->j_next_full_flush) {
- flags |= FLUSH_ALL;
- flush = 1;
- }
- if (journal->j_next_async_flush) {
- flags |= COMMIT_NOW | WAIT;
- wait_on_commit = 1;
- }
-
- /*
- * check_journal_end is called with the journal locked and unlocks
- * it unless it returns 1. It tells us whether we should continue
- * with the journal_end or just return.
- */
- if (!check_journal_end(th, flags)) {
- reiserfs_schedule_old_flush(sb);
- wake_queued_writers(sb);
- reiserfs_async_progress_wait(sb);
- goto out;
- }
-
- /* check_journal_end might set these, check again */
- if (journal->j_next_full_flush) {
- flush = 1;
- }
-
- /*
- * a nonzero j_must_wait means we have to flush the log blocks
- * and the real blocks for this transaction
- */
- if (journal->j_must_wait > 0) {
- flush = 1;
- }
-#ifdef REISERFS_PREALLOCATE
- /*
- * quota ops might need to nest, setup the journal_info pointer
- * for them and raise the refcount so that it is > 0.
- */
- current->journal_info = th;
- th->t_refcount++;
-
- /* this should not add new blocks to the transaction */
- reiserfs_discard_all_prealloc(th);
-
- th->t_refcount--;
- current->journal_info = th->t_handle_save;
-#endif
-
- /* setup description block */
- d_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- journal->j_start);
- set_buffer_uptodate(d_bh);
- desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
- memset(d_bh->b_data, 0, d_bh->b_size);
- memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
- set_desc_trans_id(desc, journal->j_trans_id);
-
- /*
- * set up the commit block. Don't write it (keep it clean, too)
- * until everything else has been written
- */
- c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- ((journal->j_start + journal->j_len +
- 1) % SB_ONDISK_JOURNAL_SIZE(sb)));
- commit = (struct reiserfs_journal_commit *)c_bh->b_data;
- memset(c_bh->b_data, 0, c_bh->b_size);
- set_commit_trans_id(commit, journal->j_trans_id);
- set_buffer_uptodate(c_bh);
-
- /* init this journal list */
- jl = journal->j_current_jl;
-
- /*
- * we lock the commit before doing anything because
- * we want to make sure nobody tries to run flush_commit_list until
- * the new transaction is fully setup, and we've already flushed the
- * ordered bh list
- */
- reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
-
- /* save the transaction id in case we need to commit it later */
- commit_trans_id = jl->j_trans_id;
-
- atomic_set(&jl->j_older_commits_done, 0);
- jl->j_trans_id = journal->j_trans_id;
- jl->j_timestamp = journal->j_trans_start_time;
- jl->j_commit_bh = c_bh;
- jl->j_start = journal->j_start;
- jl->j_len = journal->j_len;
- atomic_set(&jl->j_nonzerolen, journal->j_len);
- atomic_set(&jl->j_commit_left, journal->j_len + 2);
- jl->j_realblock = NULL;
-
- /*
- * The ENTIRE FOR LOOP MUST not cause schedule to occur.
- * for each real block, add it to the journal list hash,
- * copy into real block index array in the commit or desc block
- */
- trans_half = journal_trans_half(sb->s_blocksize);
- for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
- if (buffer_journaled(cn->bh)) {
- jl_cn = get_cnode(sb);
- if (!jl_cn) {
- reiserfs_panic(sb, "journal-1676",
- "get_cnode returned NULL");
- }
- if (i == 0) {
- jl->j_realblock = jl_cn;
- }
- jl_cn->prev = last_cn;
- jl_cn->next = NULL;
- if (last_cn) {
- last_cn->next = jl_cn;
- }
- last_cn = jl_cn;
- /*
- * make sure the block we are trying to log
- * is not a block of journal or reserved area
- */
- if (is_block_in_log_or_reserved_area(sb, cn->bh->b_blocknr)) {
- reiserfs_panic(sb, "journal-2332",
- "Trying to log block %lu, "
- "which is a log block",
- cn->bh->b_blocknr);
- }
- jl_cn->blocknr = cn->bh->b_blocknr;
- jl_cn->state = 0;
- jl_cn->sb = sb;
- jl_cn->bh = cn->bh;
- jl_cn->jlist = jl;
- insert_journal_hash(journal->j_list_hash_table, jl_cn);
- if (i < trans_half) {
- desc->j_realblock[i] =
- cpu_to_le32(cn->bh->b_blocknr);
- } else {
- commit->j_realblock[i - trans_half] =
- cpu_to_le32(cn->bh->b_blocknr);
- }
- } else {
- i--;
- }
- }
- set_desc_trans_len(desc, journal->j_len);
- set_desc_mount_id(desc, journal->j_mount_id);
- set_desc_trans_id(desc, journal->j_trans_id);
- set_commit_trans_len(commit, journal->j_len);
-
- /*
- * special check in case all buffers in the journal
- * were marked for not logging
- */
- BUG_ON(journal->j_len == 0);
-
- /*
- * we're about to dirty all the log blocks, mark the description block
- * dirty now too. Don't mark the commit block dirty until all the
- * others are on disk
- */
- mark_buffer_dirty(d_bh);
-
- /*
- * first data block is j_start + 1, so add one to
- * cur_write_start wherever you use it
- */
- cur_write_start = journal->j_start;
- cn = journal->j_first;
- jindex = 1; /* start at one so we don't get the desc again */
- while (cn) {
- clear_buffer_journal_new(cn->bh);
- /* copy all the real blocks into log area. dirty log blocks */
- if (buffer_journaled(cn->bh)) {
- struct buffer_head *tmp_bh;
- char *addr;
- struct page *page;
- tmp_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
- ((cur_write_start + jindex) %
- SB_ONDISK_JOURNAL_SIZE(sb)));
- set_buffer_uptodate(tmp_bh);
- page = cn->bh->b_page;
- addr = kmap(page);
- memcpy(tmp_bh->b_data,
- addr + offset_in_page(cn->bh->b_data),
- cn->bh->b_size);
- kunmap(page);
- mark_buffer_dirty(tmp_bh);
- jindex++;
- set_buffer_journal_dirty(cn->bh);
- clear_buffer_journaled(cn->bh);
- } else {
- /*
- * JDirty cleared sometime during transaction.
- * don't log this one
- */
- reiserfs_warning(sb, "journal-2048",
- "BAD, buffer in journal hash, "
- "but not JDirty!");
- brelse(cn->bh);
- }
- next = cn->next;
- free_cnode(sb, cn);
- cn = next;
- reiserfs_cond_resched(sb);
- }
-
- /*
- * we are done with both the c_bh and d_bh, but
- * c_bh must be written after all other commit blocks,
- * so we dirty/release c_bh in flush_commit_list, with commit_left <= 1.
- */
-
- journal->j_current_jl = alloc_journal_list(sb);
-
- /* now it is safe to insert this transaction on the main list */
- list_add_tail(&jl->j_list, &journal->j_journal_list);
- list_add_tail(&jl->j_working_list, &journal->j_working_list);
- journal->j_num_work_lists++;
-
- /* reset journal values for the next transaction */
- journal->j_start =
- (journal->j_start + journal->j_len +
- 2) % SB_ONDISK_JOURNAL_SIZE(sb);
- atomic_set(&journal->j_wcount, 0);
- journal->j_bcount = 0;
- journal->j_last = NULL;
- journal->j_first = NULL;
- journal->j_len = 0;
- journal->j_trans_start_time = 0;
- /* check for trans_id overflow */
- if (++journal->j_trans_id == 0)
- journal->j_trans_id = 10;
- journal->j_current_jl->j_trans_id = journal->j_trans_id;
- journal->j_must_wait = 0;
- journal->j_len_alloc = 0;
- journal->j_next_full_flush = 0;
- journal->j_next_async_flush = 0;
- init_journal_hash(sb);
-
- /*
- * make sure reiserfs_add_jh sees the new current_jl before we
- * write out the tails
- */
- smp_mb();
-
- /*
- * tail conversion targets have to hit the disk before we end the
- * transaction. Otherwise a later transaction might repack the tail
- * before this transaction commits, leaving the data block unflushed
- * and clean. If we crash before the later transaction commits, the
- * data block is lost.
- */
- if (!list_empty(&jl->j_tail_bh_list)) {
- depth = reiserfs_write_unlock_nested(sb);
- write_ordered_buffers(&journal->j_dirty_buffers_lock,
- journal, jl, &jl->j_tail_bh_list);
- reiserfs_write_lock_nested(sb, depth);
- }
- BUG_ON(!list_empty(&jl->j_tail_bh_list));
- mutex_unlock(&jl->j_commit_mutex);
-
- /*
- * honor the flush wishes from the caller, simple commits can
- * be done outside the journal lock, they are done below
- *
- * if we don't flush the commit list right now, we put it into
- * the work queue so the people waiting on the async progress work
- * queue don't wait for this proc to flush journal lists and such.
- */
- if (flush) {
- flush_commit_list(sb, jl, 1);
- flush_journal_list(sb, jl, 1);
- } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
- /*
- * Avoid queueing work when sb is being shut down. Transaction
- * will be flushed on journal shutdown.
- */
- if (sb->s_flags & SB_ACTIVE)
- queue_delayed_work(REISERFS_SB(sb)->commit_wq,
- &journal->j_work, HZ / 10);
- }
-
- /*
- * if the next transaction has any chance of wrapping, flush
- * transactions that might get overwritten. If any journal lists
- * are very old, flush them as well.
- */
-first_jl:
- list_for_each_safe(entry, safe, &journal->j_journal_list) {
- temp_jl = JOURNAL_LIST_ENTRY(entry);
- if (journal->j_start <= temp_jl->j_start) {
- if ((journal->j_start + journal->j_trans_max + 1) >=
- temp_jl->j_start) {
- flush_used_journal_lists(sb, temp_jl);
- goto first_jl;
- } else if ((journal->j_start +
- journal->j_trans_max + 1) <
- SB_ONDISK_JOURNAL_SIZE(sb)) {
- /*
- * if we don't cross into the next
- * transaction and we don't wrap, there is
- * no way we can overlap any later
- * transactions; break now
- */
- break;
- }
- } else if ((journal->j_start +
- journal->j_trans_max + 1) >
- SB_ONDISK_JOURNAL_SIZE(sb)) {
- if (((journal->j_start + journal->j_trans_max + 1) %
- SB_ONDISK_JOURNAL_SIZE(sb)) >=
- temp_jl->j_start) {
- flush_used_journal_lists(sb, temp_jl);
- goto first_jl;
- } else {
- /*
- * we don't overlap anything from our start
- * to the end of the log, and our wrapped
- * portion doesn't overlap anything at
- * the start of the log. We can break
- */
- break;
- }
- }
- }
-
- journal->j_current_jl->j_list_bitmap =
- get_list_bitmap(sb, journal->j_current_jl);
-
- if (!(journal->j_current_jl->j_list_bitmap)) {
- reiserfs_panic(sb, "journal-1996",
- "could not get a list bitmap");
- }
-
- atomic_set(&journal->j_jlock, 0);
- unlock_journal(sb);
- /* wake up anybody waiting to join. */
- clear_bit(J_WRITERS_QUEUED, &journal->j_state);
- wake_up(&journal->j_join_wait);
-
- if (!flush && wait_on_commit &&
- journal_list_still_alive(sb, commit_trans_id)) {
- flush_commit_list(sb, jl, 1);
- }
-out:
- reiserfs_check_lock_depth(sb, "journal end2");
-
- memset(th, 0, sizeof(*th));
- /*
- * Re-set th->t_super, so we can properly keep track of how many
- * persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later
- */
- th->t_super = sb;
-
- return journal->j_errno;
-}
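-
-/*
- * Standalone sketch of the on-disk log arithmetic used above: a
- * transaction occupies one description block, j_len real-block copies
- * and one commit block, all laid out modulo the journal size, which is
- * why j_start advances by j_len + 2.
- */
-#if 0 /* illustrative sketch only */
-static unsigned long example_commit_block(unsigned long j_start,
- unsigned long j_len,
- unsigned long log_size)
-{
- return (j_start + j_len + 1) % log_size; /* as for c_bh above */
-}
-
-static unsigned long example_next_start(unsigned long j_start,
- unsigned long j_len,
- unsigned long log_size)
-{
- return (j_start + j_len + 2) % log_size; /* desc + data + commit */
-}
-#endif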
-
-/* Set the file system read-only and refuse new transactions */
-void reiserfs_abort_journal(struct super_block *sb, int errno)
-{
- struct reiserfs_journal *journal = SB_JOURNAL(sb);
- if (test_bit(J_ABORTED, &journal->j_state))
- return;
-
- if (!journal->j_errno)
- journal->j_errno = errno;
-
- sb->s_flags |= SB_RDONLY;
- set_bit(J_ABORTED, &journal->j_state);
-
-#ifdef CONFIG_REISERFS_CHECK
- dump_stack();
-#endif
-}
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
deleted file mode 100644
index 7f868569d4d0..000000000000
--- a/fs/reiserfs/lbalance.c
+++ /dev/null
@@ -1,1426 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/uaccess.h>
-#include <linux/string.h>
-#include <linux/time.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-/*
- * copy copy_count entries from source directory item to dest buffer
- * (creating new item if needed)
- */
-static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
- struct buffer_head *source, int last_first,
- int item_num, int from, int copy_count)
-{
- struct buffer_head *dest = dest_bi->bi_bh;
- /*
- * either the number of the target item or, if we must create a
- * new item, the number of the item we will create it next to
- */
- int item_num_in_dest;
-
- struct item_head *ih;
- struct reiserfs_de_head *deh;
- int copy_records_len; /* length of all records in item to be copied */
- char *records;
-
- ih = item_head(source, item_num);
-
- RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
-
- /*
- * length of all records to be copied and first byte of
- * the last of them
- */
- deh = B_I_DEH(source, ih);
- if (copy_count) {
- copy_records_len = (from ? deh_location(&deh[from - 1]) :
- ih_item_len(ih)) -
- deh_location(&deh[from + copy_count - 1]);
- records =
- source->b_data + ih_location(ih) +
- deh_location(&deh[from + copy_count - 1]);
- } else {
- copy_records_len = 0;
- records = NULL;
- }
-
- /* when copy last to first, dest buffer can contain 0 items */
- item_num_in_dest = (last_first == LAST_TO_FIRST) ?
- (B_NR_ITEMS(dest) ? 0 : -1) : (B_NR_ITEMS(dest) - 1);
-
- /*
- * if there are no items in dest or the first/last item in
- * dest is not item of the same directory
- */
- if ((item_num_in_dest == -1) ||
- (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
- (last_first == LAST_TO_FIRST
- && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
- leaf_key(dest,
- item_num_in_dest))))
- {
- /* create new item in dest */
- struct item_head new_ih;
-
- /* form item header */
- memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
- put_ih_version(&new_ih, KEY_FORMAT_3_5);
- /* calculate item len */
- put_ih_item_len(&new_ih,
- DEH_SIZE * copy_count + copy_records_len);
- put_ih_entry_count(&new_ih, 0);
-
- if (last_first == LAST_TO_FIRST) {
- /* form key by the following way */
- if (from < ih_entry_count(ih)) {
- set_le_ih_k_offset(&new_ih,
- deh_offset(&deh[from]));
- } else {
- /*
- * no entries will be copied to this
- * item in this function
- */
- set_le_ih_k_offset(&new_ih, U32_MAX);
- /*
- * this item is not yet valid, but we
- * want I_IS_DIRECTORY_ITEM to return 1
- * for it, so we set the offset to -1
- * (U32_MAX)
- */
- }
- set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
- TYPE_DIRENTRY);
- }
-
- /* insert item into dest buffer */
- leaf_insert_into_buf(dest_bi,
- (last_first ==
- LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
- &new_ih, NULL, 0);
- } else {
- /* prepare space for entries */
- leaf_paste_in_buffer(dest_bi,
- (last_first ==
- FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
- 1) : 0, MAX_US_INT,
- DEH_SIZE * copy_count + copy_records_len,
- records, 0);
- }
-
- item_num_in_dest =
- (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
-
- leaf_paste_entries(dest_bi, item_num_in_dest,
- (last_first ==
- FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
- item_num_in_dest))
- : 0, copy_count, deh + from, records,
- DEH_SIZE * copy_count + copy_records_len);
-}
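-
-/*
- * Small sketch of the record-length arithmetic above: entry records are
- * laid out back to front inside a directory item, so the bytes occupied
- * by entries [from, from + count) lie between two deh_location() values.
- * Plain ints stand in for the little-endian on-disk fields.
- */
-#if 0 /* illustrative sketch only */
-static int example_records_len(const int *loc, int item_len,
- int from, int count)
-{
- int upper = from ? loc[from - 1] : item_len;
-
- return upper - loc[from + count - 1];
-}
-#endif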
-
-/*
- * Copy the first (if last_first == FIRST_TO_LAST) or last
- * (last_first == LAST_TO_FIRST) item or part of it or nothing
- * (see the return 0 below) from SOURCE to the end (if last_first)
- * or beginning (!last_first) of the DEST
- */
-/* returns 1 if anything was copied, else 0 */
-static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
- struct buffer_head *src, int last_first,
- int bytes_or_entries)
-{
- struct buffer_head *dest = dest_bi->bi_bh;
- /* number of items in the source and destination buffers */
- int dest_nr_item, src_nr_item;
- struct item_head *ih;
- struct item_head *dih;
-
- dest_nr_item = B_NR_ITEMS(dest);
-
- /*
- * if ( DEST is empty or first item of SOURCE and last item of
- * DEST are the items of different objects or of different types )
- * then there is no need to treat this item differently from the
- * other items that we copy, so we return
- */
- if (last_first == FIRST_TO_LAST) {
- ih = item_head(src, 0);
- dih = item_head(dest, dest_nr_item - 1);
-
- /* there is nothing to merge */
- if (!dest_nr_item
- || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
- return 0;
-
- RFALSE(!ih_item_len(ih),
- "vs-10010: item can not have empty length");
-
- if (is_direntry_le_ih(ih)) {
- if (bytes_or_entries == -1)
- /* copy all entries to dest */
- bytes_or_entries = ih_entry_count(ih);
- leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
- bytes_or_entries);
- return 1;
- }
-
- /*
- * copy part of the body of the first item of SOURCE
- * to the end of the body of the last item of the DEST;
- * the part is defined by 'bytes_or_entries'; if
- * bytes_or_entries == -1 copy the whole body; don't
- * create a new item header
- */
- if (bytes_or_entries == -1)
- bytes_or_entries = ih_item_len(ih);
-
-#ifdef CONFIG_REISERFS_CHECK
- else {
- if (bytes_or_entries == ih_item_len(ih)
- && is_indirect_le_ih(ih))
- if (get_ih_free_space(ih))
- reiserfs_panic(sb_from_bi(dest_bi),
- "vs-10020",
- "last unformatted node "
- "must be filled "
- "entirely (%h)", ih);
- }
-#endif
-
- /*
- * merge first item (or its part) of src buffer with the last
- * item of dest buffer. Both are of the same file
- */
- leaf_paste_in_buffer(dest_bi,
- dest_nr_item - 1, ih_item_len(dih),
- bytes_or_entries, ih_item_body(src, ih), 0);
-
- if (is_indirect_le_ih(dih)) {
- RFALSE(get_ih_free_space(dih),
- "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
- ih);
- if (bytes_or_entries == ih_item_len(ih))
- set_ih_free_space(dih, get_ih_free_space(ih));
- }
-
- return 1;
- }
-
- /* copy boundary item to right (last_first == LAST_TO_FIRST) */
-
- /*
- * (DEST is empty or last item of SOURCE and first item of DEST
- * are items of different objects or of different types)
- */
- src_nr_item = B_NR_ITEMS(src);
- ih = item_head(src, src_nr_item - 1);
- dih = item_head(dest, 0);
-
- if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
- return 0;
-
- if (is_direntry_le_ih(ih)) {
- /*
- * bytes_or_entries = entries number in last
- * item body of SOURCE
- */
- if (bytes_or_entries == -1)
- bytes_or_entries = ih_entry_count(ih);
-
- leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
- src_nr_item - 1,
- ih_entry_count(ih) - bytes_or_entries,
- bytes_or_entries);
- return 1;
- }
-
- /*
- * copy part of the body of the last item of SOURCE to the
- * beginning of the body of the first item of the DEST; the part
- * is defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy
- * the whole body; change the first item key of the DEST; don't
- * create a new item header
- */
-
- RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
- "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
- ih);
-
- if (bytes_or_entries == -1) {
- /* bytes_or_entries = length of last item body of SOURCE */
- bytes_or_entries = ih_item_len(ih);
-
- RFALSE(le_ih_k_offset(dih) !=
- le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
- "vs-10050: items %h and %h do not match", ih, dih);
-
- /* change first item key of the DEST */
- set_le_ih_k_offset(dih, le_ih_k_offset(ih));
-
- /* item becomes non-mergeable */
- /* or mergeable if left item was */
- set_le_ih_k_type(dih, le_ih_k_type(ih));
- } else {
- /* merge to right only part of item */
- RFALSE(ih_item_len(ih) <= bytes_or_entries,
- "vs-10060: no so much bytes %lu (needed %lu)",
- (unsigned long)ih_item_len(ih),
- (unsigned long)bytes_or_entries);
-
- /* change first item key of the DEST */
- if (is_direct_le_ih(dih)) {
- RFALSE(le_ih_k_offset(dih) <=
- (unsigned long)bytes_or_entries,
- "vs-10070: dih %h, bytes_or_entries(%d)", dih,
- bytes_or_entries);
- set_le_ih_k_offset(dih,
- le_ih_k_offset(dih) -
- bytes_or_entries);
- } else {
- RFALSE(le_ih_k_offset(dih) <=
- (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
- "vs-10080: dih %h, bytes_or_entries(%d)",
- dih,
- (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
- set_le_ih_k_offset(dih,
- le_ih_k_offset(dih) -
- ((bytes_or_entries / UNFM_P_SIZE) *
- dest->b_size));
- }
- }
-
- leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
- ih_item_body(src,
- ih) + ih_item_len(ih) - bytes_or_entries,
- 0);
- return 1;
-}
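-
-/*
- * Sketch of the key adjustment above when part of an item is merged to
- * the right: direct items count bytes, while indirect items count
- * unformatted-node pointers (UNFM_P_SIZE bytes each, one data block per
- * pointer), so their offsets move by whole blocks.
- */
-#if 0 /* illustrative sketch only */
-static unsigned long example_new_right_offset(unsigned long offset,
- int bytes, int is_direct,
- int unfm_p_size, int block_size)
-{
- if (is_direct)
- return offset - bytes;
- return offset - (unsigned long)(bytes / unfm_p_size) * block_size;
-}
-#endif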
-
-/*
- * copy cpy_num items from buffer src to buffer dest.
- * last_first == FIRST_TO_LAST means that we copy cpy_num items beginning
- * with the first-th item in src to the tail of dest
- * last_first == LAST_TO_FIRST means that we copy cpy_num items beginning
- * with the first-th item in src to the head of dest
- */
-static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
- struct buffer_head *src, int last_first,
- int first, int cpy_num)
-{
- struct buffer_head *dest;
- int nr, free_space;
- int dest_before;
- int last_loc, last_inserted_loc, location;
- int i, j;
- struct block_head *blkh;
- struct item_head *ih;
-
- RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
- "vs-10090: bad last_first parameter %d", last_first);
- RFALSE(B_NR_ITEMS(src) - first < cpy_num,
- "vs-10100: too few items in source %d, required %d from %d",
- B_NR_ITEMS(src), cpy_num, first);
- RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
- RFALSE(!dest_bi, "vs-10120: destination buffer_info is NULL");
-
- dest = dest_bi->bi_bh;
-
- RFALSE(!dest, "vs-10130: destination buffer is NULL");
-
- if (cpy_num == 0)
- return;
-
- blkh = B_BLK_HEAD(dest);
- nr = blkh_nr_item(blkh);
- free_space = blkh_free_space(blkh);
-
- /*
- * we will insert items before 0-th or nr-th item in dest buffer.
- * It depends on the last_first parameter
- */
- dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
-
- /* location of head of first new item */
- ih = item_head(dest, dest_before);
-
- RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
- "vs-10140: not enough free space for headers %d (needed %d)",
- B_FREE_SPACE(dest), cpy_num * IH_SIZE);
-
- /* prepare space for headers */
- memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
-
- /* copy item headers */
- memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
-
- free_space -= (IH_SIZE * cpy_num);
- set_blkh_free_space(blkh, free_space);
-
- /* location of unmovable item */
- j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
- for (i = dest_before; i < nr + cpy_num; i++) {
- location -= ih_item_len(ih + i - dest_before);
- put_ih_location(ih + i - dest_before, location);
- }
-
- /* prepare space for items */
- last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
- last_inserted_loc = ih_location(&ih[cpy_num - 1]);
-
- /* check free space */
- RFALSE(free_space < j - last_inserted_loc,
- "vs-10150: not enough free space for items %d (needed %d)",
- free_space, j - last_inserted_loc);
-
- memmove(dest->b_data + last_loc,
- dest->b_data + last_loc + j - last_inserted_loc,
- last_inserted_loc - last_loc);
-
- /* copy items */
- memcpy(dest->b_data + last_inserted_loc,
- item_body(src, (first + cpy_num - 1)),
- j - last_inserted_loc);
-
- /* sizes, item number */
- set_blkh_nr_item(blkh, nr + cpy_num);
- set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
-
- do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
-
- if (dest_bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
- RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
- "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
- (long unsigned)dest->b_blocknr,
- (long unsigned)dc_block_number(t_dc));
- put_dc_size(t_dc,
- dc_size(t_dc) + (j - last_inserted_loc +
- IH_SIZE * cpy_num));
-
- do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
- 0);
- }
-}
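-
-/*
- * Sketch of the leaf layout the location loop above relies on: item
- * headers grow upward from the block head while item bodies grow
- * downward from the end of the block, so each location is the previous
- * one minus the new item's length.
- */
-#if 0 /* illustrative sketch only */
-static void example_assign_locations(int *location, const int *len,
- int nr, int block_size)
-{
- int loc = block_size; /* the first body ends at the block end */
- int i;
-
- for (i = 0; i < nr; i++) {
- loc -= len[i];
- location[i] = loc;
- }
-}
-#endif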
-
-/*
- * This function splits the (liquid) item into two items (useful when
- * shifting part of an item into another node).
- */
-static void leaf_item_bottle(struct buffer_info *dest_bi,
- struct buffer_head *src, int last_first,
- int item_num, int cpy_bytes)
-{
- struct buffer_head *dest = dest_bi->bi_bh;
- struct item_head *ih;
-
- RFALSE(cpy_bytes == -1,
- "vs-10170: bytes == - 1 means: do not split item");
-
- if (last_first == FIRST_TO_LAST) {
- /*
- * if the item in position item_num in buffer SOURCE
- * is a directory item
- */
- ih = item_head(src, item_num);
- if (is_direntry_le_ih(ih))
- leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
- item_num, 0, cpy_bytes);
- else {
- struct item_head n_ih;
-
- /*
- * copy part of the body of the item number 'item_num'
- * of SOURCE to the end of the DEST; the part is defined
- * by 'cpy_bytes'; create a new item header; change old
- * item_header (????); n_ih = new item_header;
- */
- memcpy(&n_ih, ih, IH_SIZE);
- put_ih_item_len(&n_ih, cpy_bytes);
- if (is_indirect_le_ih(ih)) {
- RFALSE(cpy_bytes == ih_item_len(ih)
- && get_ih_free_space(ih),
- "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
- (long unsigned)get_ih_free_space(ih));
- set_ih_free_space(&n_ih, 0);
- }
-
- RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
- "vs-10190: bad mergeability of item %h", ih);
- n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
- leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
- item_body(src, item_num), 0);
- }
- } else {
- /*
- * if the item in position item_num in buffer
- * SOURCE is a directory item
- */
- ih = item_head(src, item_num);
- if (is_direntry_le_ih(ih))
- leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
- item_num,
- ih_entry_count(ih) - cpy_bytes,
- cpy_bytes);
- else {
- struct item_head n_ih;
-
- /*
- * copy part of the body of the item number 'item_num'
- * of SOURCE to the beginning of the DEST; the part is
- * defined by 'cpy_bytes'; create a new item header;
- * n_ih = new item_header;
- */
- memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
-
- /* Endian safe, both le */
- n_ih.ih_version = ih->ih_version;
-
- if (is_direct_le_ih(ih)) {
- set_le_ih_k_offset(&n_ih,
- le_ih_k_offset(ih) +
- ih_item_len(ih) - cpy_bytes);
- set_le_ih_k_type(&n_ih, TYPE_DIRECT);
- set_ih_free_space(&n_ih, MAX_US_INT);
- } else {
- /* indirect item */
- RFALSE(!cpy_bytes && get_ih_free_space(ih),
- "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
- set_le_ih_k_offset(&n_ih,
- le_ih_k_offset(ih) +
- (ih_item_len(ih) -
- cpy_bytes) / UNFM_P_SIZE *
- dest->b_size);
- set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
- set_ih_free_space(&n_ih, get_ih_free_space(ih));
- }
-
- /* set item length */
- put_ih_item_len(&n_ih, cpy_bytes);
-
- /* Endian safe, both le */
- n_ih.ih_version = ih->ih_version;
-
- leaf_insert_into_buf(dest_bi, 0, &n_ih,
- item_body(src, item_num) +
- ih_item_len(ih) - cpy_bytes, 0);
- }
- }
-}
-
-/*
- * If cpy_bytes equals minus one, copy cpy_num whole items from SOURCE
- * to DEST. If cpy_bytes is not equal to minus one, copy cpy_num-1 whole
- * items from SOURCE to DEST; from the last item copy cpy_bytes bytes for
- * a regular item or cpy_bytes directory entries for a directory item.
- */
-static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
- int last_first, int cpy_num, int cpy_bytes)
-{
- struct buffer_head *dest;
- int pos, i, src_nr_item, bytes;
-
- dest = dest_bi->bi_bh;
- RFALSE(!dest || !src, "vs-10210: !dest || !src");
- RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
- "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
- RFALSE(B_NR_ITEMS(src) < cpy_num,
- "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
- cpy_num);
- RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
-
- if (cpy_num == 0)
- return 0;
-
- if (last_first == FIRST_TO_LAST) {
- /* copy items to left */
- pos = 0;
- if (cpy_num == 1)
- bytes = cpy_bytes;
- else
- bytes = -1;
-
- /*
- * copy the first item or part of it or nothing to the end of
- * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
- */
- i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
- cpy_num -= i;
- if (cpy_num == 0)
- return i;
- pos += i;
- if (cpy_bytes == -1)
- /*
- * copy first cpy_num items starting from position
- * 'pos' of SOURCE to end of DEST
- */
- leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
- pos, cpy_num);
- else {
- /*
- * copy first cpy_num-1 items starting from position
- * 'pos' of the SOURCE to the end of the DEST
- */
- leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
- pos, cpy_num - 1);
-
- /*
- * copy part of the item which number is
- * cpy_num+pos-1 to the end of the DEST
- */
- leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
- cpy_num + pos - 1, cpy_bytes);
- }
- } else {
- /* copy items to right */
- src_nr_item = B_NR_ITEMS(src);
- if (cpy_num == 1)
- bytes = cpy_bytes;
- else
- bytes = -1;
-
- /*
- * copy the last item or part of it or nothing to the
- * beginning of the DEST
- * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
- */
- i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
-
- cpy_num -= i;
- if (cpy_num == 0)
- return i;
-
- pos = src_nr_item - cpy_num - i;
- if (cpy_bytes == -1) {
- /*
- * starting from position 'pos' copy last cpy_num
- * items of SOURCE to the beginning of DEST
- */
- leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
- pos, cpy_num);
- } else {
- /*
- * copy last cpy_num-1 items starting from position
- * 'pos+1' of the SOURCE to the beginning of the DEST;
- */
- leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
- pos + 1, cpy_num - 1);
-
- /*
- * copy part of the item which number is pos to
- * the begin of the DEST
- */
- leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
- cpy_bytes);
- }
- }
- return i;
-}
-
-/*
- * there are several types of copying: from S[0] to L[0], from S[0] to
- * R[0], from R[0] to L[0], and so on. For each of these we have to define
- * the parent and the positions of the destination and source buffers
- */
-static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
- struct buffer_info *dest_bi,
- struct buffer_info *src_bi,
- int *first_last,
- struct buffer_head *Snew)
-{
- memset(dest_bi, 0, sizeof(struct buffer_info));
- memset(src_bi, 0, sizeof(struct buffer_info));
-
- /* define dest, src, dest parent, dest position */
- switch (shift_mode) {
- case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */
- src_bi->tb = tb;
- src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
- src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
-
- /* src->b_item_order */
- src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->L[0];
- dest_bi->bi_parent = tb->FL[0];
- dest_bi->bi_position = get_left_neighbor_position(tb, 0);
- *first_last = FIRST_TO_LAST;
- break;
-
- case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */
- src_bi->tb = tb;
- src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
- src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
- src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->R[0];
- dest_bi->bi_parent = tb->FR[0];
- dest_bi->bi_position = get_right_neighbor_position(tb, 0);
- *first_last = LAST_TO_FIRST;
- break;
-
- case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */
- src_bi->tb = tb;
- src_bi->bi_bh = tb->R[0];
- src_bi->bi_parent = tb->FR[0];
- src_bi->bi_position = get_right_neighbor_position(tb, 0);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->L[0];
- dest_bi->bi_parent = tb->FL[0];
- dest_bi->bi_position = get_left_neighbor_position(tb, 0);
- *first_last = FIRST_TO_LAST;
- break;
-
- case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */
- src_bi->tb = tb;
- src_bi->bi_bh = tb->L[0];
- src_bi->bi_parent = tb->FL[0];
- src_bi->bi_position = get_left_neighbor_position(tb, 0);
- dest_bi->tb = tb;
- dest_bi->bi_bh = tb->R[0];
- dest_bi->bi_parent = tb->FR[0];
- dest_bi->bi_position = get_right_neighbor_position(tb, 0);
- *first_last = LAST_TO_FIRST;
- break;
-
- case LEAF_FROM_S_TO_SNEW:
- src_bi->tb = tb;
- src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
- src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
- src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
- dest_bi->tb = tb;
- dest_bi->bi_bh = Snew;
- dest_bi->bi_parent = NULL;
- dest_bi->bi_position = 0;
- *first_last = LAST_TO_FIRST;
- break;
-
- default:
- reiserfs_panic(sb_from_bi(src_bi), "vs-10250",
- "shift type is unknown (%d)", shift_mode);
- }
- RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh,
- "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
- shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
-}
-
-/*
- * copy mov_num items and mov_bytes of the (mov_num-1)th item to the
- * neighbor. Delete them from the source
- */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
- int mov_bytes, struct buffer_head *Snew)
-{
- int ret_value;
- struct buffer_info dest_bi, src_bi;
- int first_last;
-
- leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
- &first_last, Snew);
-
- ret_value =
- leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
- mov_bytes);
-
- leaf_delete_items(&src_bi, first_last,
- (first_last ==
- FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
- mov_num), mov_num, mov_bytes);
-
- return ret_value;
-}
-
-/*
- * Shift shift_num items (and shift_bytes of last shifted item if
- * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
- */
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
- struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
- int i;
-
- /*
- * move shift_num (and shift_bytes bytes) items from S[0]
- * to left neighbor L[0]
- */
- i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
-
- if (shift_num) {
- /* number of items in S[0] == 0 */
- if (B_NR_ITEMS(S0) == 0) {
-
- RFALSE(shift_bytes != -1,
- "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
- shift_bytes);
-#ifdef CONFIG_REISERFS_CHECK
- if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
- print_cur_tb("vs-10275");
- reiserfs_panic(tb->tb_sb, "vs-10275",
- "balance condition corrupted "
- "(%c)", tb->tb_mode);
- }
-#endif
-
- if (PATH_H_POSITION(tb->tb_path, 1) == 0)
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- PATH_H_PPARENT(tb->tb_path, 0), 0);
-
- } else {
- /* replace lkey in CFL[0] by 0-th key from S[0]; */
- replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
-
- RFALSE((shift_bytes != -1 &&
- !(is_direntry_le_ih(item_head(S0, 0))
- && !ih_entry_count(item_head(S0, 0)))) &&
- (!op_is_left_mergeable
- (leaf_key(S0, 0), S0->b_size)),
- "vs-10280: item must be mergeable");
- }
- }
-
- return i;
-}
-
-/* CLEANING STOPPED HERE */
-
-/*
- * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
- * and replace the delimiting key
- */
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
-{
- int ret_value;
-
- /*
- * move shift_num (and shift_bytes) items from S[0] to
- * right neighbor R[0]
- */
- ret_value =
- leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
-
- /* replace rkey in CFR[0] by the 0-th key from R[0] */
- if (shift_num) {
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
-
- }
-
- return ret_value;
-}
-
-static void leaf_delete_items_entirely(struct buffer_info *bi,
- int first, int del_num);
-/*
- * If del_bytes == -1, delete del_num whole items starting from position
- * 'first' in buffer CUR.
- * Otherwise:
- * If last_first == 0, delete del_num-1 whole items starting from
- * position 'first', then delete part of the body of the first item; the
- * part is defined by del_bytes. Don't delete the first item header.
- * If last_first == 1, delete del_num-1 whole items starting from
- * position 'first+1', then delete part of the body of the last item; the
- * part is defined by del_bytes. Don't delete the last item header.
- */
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
- int first, int del_num, int del_bytes)
-{
- struct buffer_head *bh;
- int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
-
- RFALSE(!bh, "10155: bh is not defined");
- RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
- del_num);
- RFALSE(first < 0
- || first + del_num > item_amount,
- "10165: invalid number of first item to be deleted (%d) or "
- "no so much items (%d) to delete (only %d)", first,
- first + del_num, item_amount);
-
- if (del_num == 0)
- return;
-
- if (first == 0 && del_num == item_amount && del_bytes == -1) {
- make_empty_node(cur_bi);
- do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
- return;
- }
-
- if (del_bytes == -1)
- /* delete del_num items beginning from item in position first */
- leaf_delete_items_entirely(cur_bi, first, del_num);
- else {
- if (last_first == FIRST_TO_LAST) {
- /*
- * delete del_num-1 items beginning from
- * item in position first
- */
- leaf_delete_items_entirely(cur_bi, first, del_num - 1);
-
- /*
- * delete the part of the first item of the bh
- * do not delete item header
- */
- leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
- } else {
- struct item_head *ih;
- int len;
-
- /*
- * delete del_num-1 items beginning from
- * item in position first+1
- */
- leaf_delete_items_entirely(cur_bi, first + 1,
- del_num - 1);
-
- ih = item_head(bh, B_NR_ITEMS(bh) - 1);
- if (is_direntry_le_ih(ih))
- /* the last item is directory */
- /*
- * len = numbers of directory entries
- * in this item
- */
- len = ih_entry_count(ih);
- else
- /* len = body len of item */
- len = ih_item_len(ih);
-
- /*
- * delete the part of the last item of the bh
- * do not delete item header
- */
- leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
- len - del_bytes, del_bytes);
- }
- }
-}
-
-/* insert item into the leaf node in position before */
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
- struct item_head * const inserted_item_ih,
- const char * const inserted_item_body,
- int zeros_number)
-{
- struct buffer_head *bh = bi->bi_bh;
- int nr, free_space;
- struct block_head *blkh;
- struct item_head *ih;
- int i;
- int last_loc, unmoved_loc;
- char *to;
-
- blkh = B_BLK_HEAD(bh);
- nr = blkh_nr_item(blkh);
- free_space = blkh_free_space(blkh);
-
- /* check free space */
- RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
- "vs-10170: not enough free space in block %z, new item %h",
- bh, inserted_item_ih);
- RFALSE(zeros_number > ih_item_len(inserted_item_ih),
- "vs-10172: zero number == %d, item length == %d",
- zeros_number, ih_item_len(inserted_item_ih));
-
- /* get the item that the new item must be inserted before */
- ih = item_head(bh, before);
-
- /* prepare space for the body of new item */
- last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
- unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
-
- memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
- bh->b_data + last_loc, unmoved_loc - last_loc);
-
- to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
- memset(to, 0, zeros_number);
- to += zeros_number;
-
- /* copy body to prepared space */
- if (inserted_item_body)
- memmove(to, inserted_item_body,
- ih_item_len(inserted_item_ih) - zeros_number);
- else
- memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
-
- /* insert item header */
- memmove(ih + 1, ih, IH_SIZE * (nr - before));
- memmove(ih, inserted_item_ih, IH_SIZE);
-
- /* change locations */
- for (i = before; i < nr + 1; i++) {
- unmoved_loc -= ih_item_len(&ih[i - before]);
- put_ih_location(&ih[i - before], unmoved_loc);
- }
-
- /* sizes, free space, item number */
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
- set_blkh_free_space(blkh,
- free_space - (IH_SIZE +
- ih_item_len(inserted_item_ih)));
- do_balance_mark_leaf_dirty(bi->tb, bh, 1);
-
- if (bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
- put_dc_size(t_dc,
- dc_size(t_dc) + (IH_SIZE +
- ih_item_len(inserted_item_ih)));
- do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
- }
-}
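-
-/*
- * Free-space bookkeeping for the insert above, as a one-line sketch: an
- * insertion consumes one item header plus the item body, which is
- * exactly what the RFALSE check at the top requires to be available.
- */
-#if 0 /* illustrative sketch only */
-static int example_free_space_after_insert(int free_space, int item_len)
-{
- return free_space - (IH_SIZE + item_len); /* header + body */
-}
-#endif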
-
-/*
- * paste paste_size bytes to affected_item_num-th item.
- * When the item is a directory, this only prepares space for new entries
- */
-void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
- int pos_in_item, int paste_size,
- const char *body, int zeros_number)
-{
- struct buffer_head *bh = bi->bi_bh;
- int nr, free_space;
- struct block_head *blkh;
- struct item_head *ih;
- int i;
- int last_loc, unmoved_loc;
-
- blkh = B_BLK_HEAD(bh);
- nr = blkh_nr_item(blkh);
- free_space = blkh_free_space(blkh);
-
- /* check free space */
- RFALSE(free_space < paste_size,
- "vs-10175: not enough free space: needed %d, available %d",
- paste_size, free_space);
-
-#ifdef CONFIG_REISERFS_CHECK
- if (zeros_number > paste_size) {
- struct super_block *sb = NULL;
- if (bi && bi->tb)
- sb = bi->tb->tb_sb;
- print_cur_tb("10177");
- reiserfs_panic(sb, "vs-10177",
- "zeros_number == %d, paste_size == %d",
- zeros_number, paste_size);
- }
-#endif /* CONFIG_REISERFS_CHECK */
-
- /* item to be appended */
- ih = item_head(bh, affected_item_num);
-
- last_loc = ih_location(&ih[nr - affected_item_num - 1]);
- unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
-
- /* prepare space */
- memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
- unmoved_loc - last_loc);
-
- /* change locations */
- for (i = affected_item_num; i < nr; i++)
- put_ih_location(&ih[i - affected_item_num],
- ih_location(&ih[i - affected_item_num]) -
- paste_size);
-
- if (body) {
- if (!is_direntry_le_ih(ih)) {
- if (!pos_in_item) {
- /* shift data to right */
- memmove(bh->b_data + ih_location(ih) +
- paste_size,
- bh->b_data + ih_location(ih),
- ih_item_len(ih));
- /* paste data in the head of item */
- memset(bh->b_data + ih_location(ih), 0,
- zeros_number);
- memcpy(bh->b_data + ih_location(ih) +
- zeros_number, body,
- paste_size - zeros_number);
- } else {
- memset(bh->b_data + unmoved_loc - paste_size, 0,
- zeros_number);
- memcpy(bh->b_data + unmoved_loc - paste_size +
- zeros_number, body,
- paste_size - zeros_number);
- }
- }
- } else
- memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
-
- put_ih_item_len(ih, ih_item_len(ih) + paste_size);
-
- /* change free space */
- set_blkh_free_space(blkh, free_space - paste_size);
-
- do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
- if (bi->bi_parent) {
- struct disk_child *t_dc =
- B_N_CHILD(bi->bi_parent, bi->bi_position);
- put_dc_size(t_dc, dc_size(t_dc) + paste_size);
- do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
- }
-}
-
-/*
- * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
- * does not have free space, so it moves DEHs and remaining records as
- * necessary. Return value is size of removed part of directory item
- * in bytes.
- */
-static int leaf_cut_entries(struct buffer_head *bh,
- struct item_head *ih, int from, int del_count)
-{
- char *item;
- struct reiserfs_de_head *deh;
- int prev_record_offset; /* offset of the (from-1)th record */
- char *prev_record; /* first byte of the records preceding the cut */
- int cut_records_len; /* length of all removed records */
- int i;
-
- /*
- * make sure that item is directory and there are enough entries to
- * remove
- */
- RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
- RFALSE(ih_entry_count(ih) < from + del_count,
- "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
- ih_entry_count(ih), from, del_count);
-
- if (del_count == 0)
- return 0;
-
- /* first byte of item */
- item = bh->b_data + ih_location(ih);
-
- /* entry head array */
- deh = B_I_DEH(bh, ih);
-
- /*
- * first byte of remaining entries, those are BEFORE cut entries
- * (prev_record) and length of all removed records (cut_records_len)
- */
- prev_record_offset =
- (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
- cut_records_len = prev_record_offset /*from_record */ -
- deh_location(&deh[from + del_count - 1]);
- prev_record = item + prev_record_offset;
-
- /* adjust locations of remaining entries */
- for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
- put_deh_location(&deh[i],
- deh_location(&deh[i]) -
- (DEH_SIZE * del_count));
-
- for (i = 0; i < from; i++)
- put_deh_location(&deh[i],
- deh_location(&deh[i]) - (DEH_SIZE * del_count +
- cut_records_len));
-
- put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
-
- /* shift entry head array and entries those are AFTER removed entries */
- memmove((char *)(deh + from),
- deh + from + del_count,
- prev_record - cut_records_len - (char *)(deh + from +
- del_count));
-
- /* shift records, those are BEFORE removed entries */
- memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
- prev_record, item + ih_item_len(ih) - prev_record);
-
- return DEH_SIZE * del_count + cut_records_len;
-}
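-
-/*
- * The size returned above, as a standalone computation: every removed
- * entry costs one entry head (DEH_SIZE) plus its record bytes, and the
- * record bytes again fall between two deh_location() values (plain ints
- * stand in for the on-disk fields).
- */
-#if 0 /* illustrative sketch only */
-static int example_cut_size(const int *loc, int item_len,
- int from, int del_count)
-{
- int prev = from ? loc[from - 1] : item_len;
- int cut_records_len = prev - loc[from + del_count - 1];
-
- return DEH_SIZE * del_count + cut_records_len;
-}
-#endif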
-
-/*
- * when cut item is part of regular file
- * pos_in_item - first byte that must be cut
- * cut_size - number of bytes to be cut beginning from pos_in_item
- *
- * when cut item is part of directory
- * pos_in_item - number of first deleted entry
- * cut_size - count of deleted entries
- */
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
- int pos_in_item, int cut_size)
-{
- int nr;
- struct buffer_head *bh = bi->bi_bh;
- struct block_head *blkh;
- struct item_head *ih;
- int last_loc, unmoved_loc;
- int i;
-
- blkh = B_BLK_HEAD(bh);
- nr = blkh_nr_item(blkh);
-
- /* item head of truncated item */
- ih = item_head(bh, cut_item_num);
-
- if (is_direntry_le_ih(ih)) {
- /* cut the entries first */
- cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
- if (pos_in_item == 0) {
- /* change key */
- RFALSE(cut_item_num,
- "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
- cut_item_num);
- /* change item key by key of first entry in the item */
- set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
- }
- } else {
- /* item is direct or indirect */
- RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
- RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
- "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
- (long unsigned)pos_in_item, (long unsigned)cut_size,
- (long unsigned)ih_item_len(ih));
-
- /* shift item body to left if cut is from the head of item */
- if (pos_in_item == 0) {
- memmove(bh->b_data + ih_location(ih),
- bh->b_data + ih_location(ih) + cut_size,
- ih_item_len(ih) - cut_size);
-
- /* change key of item */
- if (is_direct_le_ih(ih))
- set_le_ih_k_offset(ih,
- le_ih_k_offset(ih) +
- cut_size);
- else {
- set_le_ih_k_offset(ih,
- le_ih_k_offset(ih) +
- (cut_size / UNFM_P_SIZE) *
- bh->b_size);
- RFALSE(ih_item_len(ih) == cut_size
- && get_ih_free_space(ih),
- "10205: invalid ih_free_space (%h)", ih);
- }
- }
- }
-
- /* location of the last item */
- last_loc = ih_location(&ih[nr - cut_item_num - 1]);
-
- /* location of the item that remains in the same place */
- unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
-
- /* shift */
- memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
- unmoved_loc - last_loc - cut_size);
-
- /* change item length */
- put_ih_item_len(ih, ih_item_len(ih) - cut_size);
-
- if (is_indirect_le_ih(ih)) {
- if (pos_in_item)
- set_ih_free_space(ih, 0);
- }
-
- /* change locations */
- for (i = cut_item_num; i < nr; i++)
- put_ih_location(&ih[i - cut_item_num],
- ih_location(&ih[i - cut_item_num]) + cut_size);
-
- /* size, free space */
- set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
-
- do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
- if (bi->bi_parent) {
- struct disk_child *t_dc;
- t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
- put_dc_size(t_dc, dc_size(t_dc) - cut_size);
- do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
- }
-}
-
-/* delete del_num items from buffer starting from the first'th item */
-static void leaf_delete_items_entirely(struct buffer_info *bi,
- int first, int del_num)
-{
- struct buffer_head *bh = bi->bi_bh;
- int nr;
- int i, j;
- int last_loc, last_removed_loc;
- struct block_head *blkh;
- struct item_head *ih;
-
- RFALSE(bh == NULL, "10210: buffer is 0");
- RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
-
- if (del_num == 0)
- return;
-
- blkh = B_BLK_HEAD(bh);
- nr = blkh_nr_item(blkh);
-
- RFALSE(first < 0 || first + del_num > nr,
- "10220: first=%d, number=%d, there is %d items", first, del_num,
- nr);
-
- if (first == 0 && del_num == nr) {
- /* this does not work */
- make_empty_node(bi);
-
- do_balance_mark_leaf_dirty(bi->tb, bh, 0);
- return;
- }
-
- ih = item_head(bh, first);
-
- /* location of unmovable item */
- j = (first == 0) ? bh->b_size : ih_location(ih - 1);
-
- /* delete items */
- last_loc = ih_location(&ih[nr - 1 - first]);
- last_removed_loc = ih_location(&ih[del_num - 1]);
-
- memmove(bh->b_data + last_loc + j - last_removed_loc,
- bh->b_data + last_loc, last_removed_loc - last_loc);
-
- /* delete item headers */
- memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
-
- /* change item location */
- for (i = first; i < nr - del_num; i++)
- put_ih_location(&ih[i - first],
- ih_location(&ih[i - first]) + (j -
- last_removed_loc));
-
- /* sizes, item number */
- set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
- set_blkh_free_space(blkh,
- blkh_free_space(blkh) + (j - last_removed_loc +
- IH_SIZE * del_num));
-
- do_balance_mark_leaf_dirty(bi->tb, bh, 0);
-
- if (bi->bi_parent) {
- struct disk_child *t_dc =
- B_N_CHILD(bi->bi_parent, bi->bi_position);
- put_dc_size(t_dc,
- dc_size(t_dc) - (j - last_removed_loc +
- IH_SIZE * del_num));
- do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
- }
-}
-
-/*
- * paste new_entry_count entries (new_dehs, records) into position
- * before to item_num-th item
- */
-void leaf_paste_entries(struct buffer_info *bi,
- int item_num,
- int before,
- int new_entry_count,
- struct reiserfs_de_head *new_dehs,
- const char *records, int paste_size)
-{
- struct item_head *ih;
- char *item;
- struct reiserfs_de_head *deh;
- char *insert_point;
- int i;
- struct buffer_head *bh = bi->bi_bh;
-
- if (new_entry_count == 0)
- return;
-
- ih = item_head(bh, item_num);
-
- /*
-	 * make sure that the item is a directory item and that there
-	 * are enough records in it
- */
- RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
- RFALSE(ih_entry_count(ih) < before,
-	       "10230: there is no entry to paste entries before. entry_count = %d, before = %d",
- ih_entry_count(ih), before);
-
- /* first byte of dest item */
- item = bh->b_data + ih_location(ih);
-
- /* entry head array */
- deh = B_I_DEH(bh, ih);
-
- /* new records will be pasted at this point */
- insert_point =
- item +
- (before ? deh_location(&deh[before - 1])
- : (ih_item_len(ih) - paste_size));
-
- /* adjust locations of records that will be AFTER new records */
- for (i = ih_entry_count(ih) - 1; i >= before; i--)
- put_deh_location(&deh[i],
- deh_location(&deh[i]) +
- (DEH_SIZE * new_entry_count));
-
- /* adjust locations of records that will be BEFORE new records */
- for (i = 0; i < before; i++)
- put_deh_location(&deh[i],
- deh_location(&deh[i]) + paste_size);
-
- put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
-
- /* prepare space for pasted records */
- memmove(insert_point + paste_size, insert_point,
- item + (ih_item_len(ih) - paste_size) - insert_point);
-
- /* copy new records */
- memcpy(insert_point + DEH_SIZE * new_entry_count, records,
- paste_size - DEH_SIZE * new_entry_count);
-
- /* prepare space for new entry heads */
- deh += before;
- memmove((char *)(deh + new_entry_count), deh,
- insert_point - (char *)deh);
-
- /* copy new entry heads */
- memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
-
- /* set locations of new records */
- for (i = 0; i < new_entry_count; i++) {
- put_deh_location(&deh[i],
- deh_location(&deh[i]) +
- (-deh_location
- (&new_dehs[new_entry_count - 1]) +
- insert_point + DEH_SIZE * new_entry_count -
- item));
- }
-
-	/* change item key if necessary (when we paste before the 0-th entry) */
- if (!before) {
- set_le_ih_k_offset(ih, deh_offset(new_dehs));
- }
-#ifdef CONFIG_REISERFS_CHECK
- {
- int prev, next;
- /* check record locations */
- deh = B_I_DEH(bh, ih);
- for (i = 0; i < ih_entry_count(ih); i++) {
-			next = (i < ih_entry_count(ih) - 1) ?
-			    deh_location(&deh[i + 1]) : 0;
- prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
-
- if (prev && prev <= deh_location(&deh[i]))
- reiserfs_error(sb_from_bi(bi), "vs-10240",
- "directory item (%h) "
- "corrupted (prev %a, "
- "cur(%d) %a)",
- ih, deh + i - 1, i, deh + i);
- if (next && next >= deh_location(&deh[i]))
- reiserfs_error(sb_from_bi(bi), "vs-10250",
- "directory item (%h) "
- "corrupted (cur(%d) %a, "
- "next %a)",
- ih, i, deh + i, deh + i + 1);
- }
- }
-#endif
-
-}
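To make the location juggling above concrete: a directory item body opens with an array of entry heads, the names/records live later in the same body, and deh_location is each record's offset from the item start, decreasing as the entry index grows. A standalone sketch with a cut-down entry head (field names mirror the kernel's; sizes and offsets are invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct de_head {		/* cut-down stand-in for reiserfs_de_head */
	uint32_t offset;	/* hash-derived key offset of the entry */
	uint16_t location;	/* offset of the record within the item */
};

int main(void)
{
	char item[64];
	struct de_head deh[2] = {
		{ .offset = 100, .location = 58 },	/* "a" */
		{ .offset = 200, .location = 53 },	/* "hello" */
	};

	memset(item, 0, sizeof(item));
	memcpy(item, deh, sizeof(deh));
	memcpy(item + 58, "a", 1);
	memcpy(item + 53, "hello", 5);

	/* entry i's record runs from deh[i].location up to the previous
	 * entry's location (or the item end for entry 0) */
	printf("entry 0: %.*s\n", 64 - deh[0].location, item + deh[0].location);
	printf("entry 1: %.*s\n", deh[0].location - deh[1].location,
	       item + deh[1].location);
	return 0;
}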
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
deleted file mode 100644
index 46bd7bd63a71..000000000000
--- a/fs/reiserfs/lock.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/mutex.h>
-
-/*
- * The previous reiserfs locking scheme was heavily based on
- * the tricky properties of the Bkl:
- *
- * - it was acquired recursively by the same task
- * - performance relied on the release-while-schedule() property
- *
- * Now that we replace it with a mutex, we still want to keep the same
- * recursive property to avoid big changes in the code structure.
- * We use our own lock_owner here because the owner field on a mutex
- * is only available with SMP or mutex debugging; also, we only need this
- * field for this mutex, so there is no need for a system-wide mutex
- * facility.
- *
- * Also, this lock is often released before a call that could block because
- * reiserfs performance was partially based on the release-while-schedule()
- * property of the Bkl.
- */
-void reiserfs_write_lock(struct super_block *s)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
- if (sb_i->lock_owner != current) {
- mutex_lock(&sb_i->lock);
- sb_i->lock_owner = current;
- }
-
- /* No need to protect it, only the current task touches it */
- sb_i->lock_depth++;
-}
-
-void reiserfs_write_unlock(struct super_block *s)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
- /*
- * Are we unlocking without even holding the lock?
- * Such a situation must raise a BUG() if we don't want
- * to corrupt the data.
- */
- BUG_ON(sb_i->lock_owner != current);
-
- if (--sb_i->lock_depth == -1) {
- sb_i->lock_owner = NULL;
- mutex_unlock(&sb_i->lock);
- }
-}
-
-int __must_check reiserfs_write_unlock_nested(struct super_block *s)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
- int depth;
-
- /* this can happen when the lock isn't always held */
- if (sb_i->lock_owner != current)
- return -1;
-
- depth = sb_i->lock_depth;
-
- sb_i->lock_depth = -1;
- sb_i->lock_owner = NULL;
- mutex_unlock(&sb_i->lock);
-
- return depth;
-}
-
-void reiserfs_write_lock_nested(struct super_block *s, int depth)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
-
- /* this can happen when the lock isn't always held */
- if (depth == -1)
- return;
-
- mutex_lock(&sb_i->lock);
- sb_i->lock_owner = current;
- sb_i->lock_depth = depth;
-}
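A hedged sketch of how the nested pair above is meant to be used: save the depth, drop the lock across a call that may sleep, then retake it at the same depth (do_blocking_io() is a hypothetical helper, not a reiserfs function):

static void example_blocking_section(struct super_block *s)
{
	int depth;

	/* drop the lock, whatever the current recursion depth */
	depth = reiserfs_write_unlock_nested(s);

	do_blocking_io();	/* hypothetical call that may sleep */

	/* retake the lock and restore the depth; a no-op if depth == -1 */
	reiserfs_write_lock_nested(s, depth);
}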
-
-/*
- * Utility function to warn if it is called without the superblock
- * write lock held. caller names the call site for diagnostics (it is
- * currently unused, since the check is a WARN_ON rather than a BUG)
- */
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
- WARN_ON(sb_i->lock_depth < 0);
-}
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *sb)
-{
- struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
-
- WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
-}
-#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
deleted file mode 100644
index 7e7b531fcc49..000000000000
--- a/fs/reiserfs/namei.c
+++ /dev/null
@@ -1,1725 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to remove EHASHCOLLISION for compatibility
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/time.h>
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/quotaops.h>
-
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
-#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
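These macros saturate a directory's link count: once i_nlink would reach REISERFS_LINK_MAX it is pinned at 1, which the rest of the code reads as "too many links to count", so a pinned count is neither incremented nor decremented. A standalone sketch of the same rule (the limit value here is only illustrative):

#include <stdio.h>

#define LINK_MAX 0xffffu	/* illustrative stand-in for REISERFS_LINK_MAX */

static void inc_dir_nlink(unsigned *nlink)
{
	if (*nlink != 1) {	/* 1 means "overflowed, stop counting" */
		(*nlink)++;
		if (*nlink >= LINK_MAX)
			*nlink = 1;	/* saturate */
	}
}

static void dec_dir_nlink(unsigned *nlink)
{
	if (*nlink != 1)	/* never decrement a saturated count */
		(*nlink)--;
}

int main(void)
{
	unsigned n = LINK_MAX - 1;

	inc_dir_nlink(&n);	/* overflows, so n is pinned at 1 */
	dec_dir_nlink(&n);	/* stays pinned */
	printf("%u\n", n);	/* 1 */
	return 0;
}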
-
-/*
- * directory item contains array of entry headers. This performs
- * binary search through that array
- */
-static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
-{
- struct item_head *ih = de->de_ih;
- struct reiserfs_de_head *deh = de->de_deh;
- int rbound, lbound, j;
-
- lbound = 0;
- rbound = ih_entry_count(ih) - 1;
-
- for (j = (rbound + lbound) / 2; lbound <= rbound;
- j = (rbound + lbound) / 2) {
- if (off < deh_offset(deh + j)) {
- rbound = j - 1;
- continue;
- }
- if (off > deh_offset(deh + j)) {
- lbound = j + 1;
- continue;
- }
-		/* not a full name match, but the third key component matched */
- de->de_entry_num = j;
- return NAME_FOUND;
- }
-
- de->de_entry_num = lbound;
- return NAME_NOT_FOUND;
-}
-
-/*
- * set de to point to the directory item that the path points to
- */
-static inline void set_de_item_location(struct reiserfs_dir_entry *de,
- struct treepath *path)
-{
- de->de_bh = get_last_bh(path);
- de->de_ih = tp_item_head(path);
- de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
- de->de_item_num = PATH_LAST_POSITION(path);
-}
-
-/*
- * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
- */
-inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
-{
- struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
- BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
- de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
- de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
- de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
- if (de->de_name[de->de_namelen - 1] == 0)
- de->de_namelen = strlen(de->de_name);
-}
-
-/* what entry points to */
-static inline void set_de_object_key(struct reiserfs_dir_entry *de)
-{
- BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
- de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
- de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
-}
-
-static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
-{
- struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
-
- BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
-
- /* store key of the found entry */
- de->de_entry_key.version = KEY_FORMAT_3_5;
- de->de_entry_key.on_disk_key.k_dir_id =
- le32_to_cpu(de->de_ih->ih_key.k_dir_id);
- de->de_entry_key.on_disk_key.k_objectid =
- le32_to_cpu(de->de_ih->ih_key.k_objectid);
- set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
- set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
-}
-
-/*
- * We assign a key to each directory item, and place multiple entries in a
- * single directory item. A directory item has a key equal to the key of
- * the first directory entry in it.
- *
- * This function first calls search_by_key; then, if an item whose first
- * entry matches is not found, it looks for the entry inside the directory
- * item found by search_by_key. Fills in the path to the entry and the
- * entry position in the item.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
- struct treepath *path, struct reiserfs_dir_entry *de)
-{
- int retval;
-
- retval = search_item(sb, key, path);
- switch (retval) {
- case ITEM_NOT_FOUND:
- if (!PATH_LAST_POSITION(path)) {
- reiserfs_error(sb, "vs-7000", "search_by_key "
- "returned item position == 0");
- pathrelse(path);
- return IO_ERROR;
- }
- PATH_LAST_POSITION(path)--;
- break;
-
- case ITEM_FOUND:
- break;
-
- case IO_ERROR:
- return retval;
-
- default:
- pathrelse(path);
- reiserfs_error(sb, "vs-7002", "no path to here");
- return IO_ERROR;
- }
-
- set_de_item_location(de, path);
-
-#ifdef CONFIG_REISERFS_CHECK
- if (!is_direntry_le_ih(de->de_ih) ||
- COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
- print_block(de->de_bh, 0, -1, -1);
- reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
- "item or does not belong to the same directory "
- "as key %K", de->de_ih, key);
- }
-#endif /* CONFIG_REISERFS_CHECK */
-
- /*
- * binary search in directory item by third component of the
- * key. sets de->de_entry_num of de
- */
- retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
- path->pos_in_item = de->de_entry_num;
- if (retval != NAME_NOT_FOUND) {
- /*
- * ugly, but rename needs de_bh, de_deh, de_name,
- * de_namelen, de_objectid set
- */
- set_de_name_and_namelen(de);
- set_de_object_key(de);
- }
- return retval;
-}
-
-/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
-
-/*
- * The third component is hashed, and you can choose from more than
- * one hash function. Per directory hashes are not yet implemented
- * but are thought about. This function should be moved to hashes.c
- * Jedi, please do so. -Hans
- */
-static __u32 get_third_component(struct super_block *s,
- const char *name, int len)
-{
- __u32 res;
-
- if (!len || (len == 1 && name[0] == '.'))
- return DOT_OFFSET;
- if (len == 2 && name[0] == '.' && name[1] == '.')
- return DOT_DOT_OFFSET;
-
- res = REISERFS_SB(s)->s_hash_function(name, len);
-
- /* take bits from 7-th to 30-th including both bounds */
- res = GET_HASH_VALUE(res);
- if (res == 0)
- /*
-			 * needed so that no names sort before "." and "..",
-			 * which have hash value == 0 and generation counters
-			 * 1 and 2 respectively
- */
- res = 128;
- return res + MAX_GENERATION_NUMBER;
-}
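The offset returned above packs the hash into the high bits and reserves the low bits for a small per-collision generation counter, so names with equal hashes can coexist in one directory. A standalone sketch of that packing (the 7-bit split mirrors MAX_GENERATION_NUMBER, but the masks here are illustrative; the kernel's GET_HASH_VALUE/GET_GENERATION_NUMBER macros define the real ones):

#include <stdio.h>

#define GEN_BITS  7
#define GEN_MASK  ((1u << GEN_BITS) - 1)
#define HASH_MASK (0x7fffffffu & ~GEN_MASK)

static unsigned pack_offset(unsigned hash, unsigned gen)
{
	return (hash & HASH_MASK) | (gen & GEN_MASK);
}

int main(void)
{
	unsigned h = 0x12345678;

	/* two colliding names get distinct offsets via the generation */
	printf("0x%08x 0x%08x\n", pack_offset(h, 0), pack_offset(h, 1));
	return 0;
}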
-
-static int reiserfs_match(struct reiserfs_dir_entry *de,
- const char *name, int namelen)
-{
- int retval = NAME_NOT_FOUND;
-
- if ((namelen == de->de_namelen) &&
- !memcmp(de->de_name, name, de->de_namelen))
- retval =
- (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
- NAME_FOUND_INVISIBLE);
-
- return retval;
-}
-
-/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
-
-/* used when hash collisions exist */
-
-static int linear_search_in_dir_item(struct cpu_key *key,
- struct reiserfs_dir_entry *de,
- const char *name, int namelen)
-{
- struct reiserfs_de_head *deh = de->de_deh;
- int retval;
- int i;
-
- i = de->de_entry_num;
-
- if (i == ih_entry_count(de->de_ih) ||
- GET_HASH_VALUE(deh_offset(deh + i)) !=
- GET_HASH_VALUE(cpu_key_k_offset(key))) {
- i--;
- }
-
- RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
- "vs-7010: array of entry headers not found");
-
- deh += i;
-
- for (; i >= 0; i--, deh--) {
- /* hash value does not match, no need to check whole name */
- if (GET_HASH_VALUE(deh_offset(deh)) !=
- GET_HASH_VALUE(cpu_key_k_offset(key))) {
- return NAME_NOT_FOUND;
- }
-
- /* mark that this generation number is used */
- if (de->de_gen_number_bit_string)
- set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
- de->de_gen_number_bit_string);
-
- /* calculate pointer to name and namelen */
- de->de_entry_num = i;
- set_de_name_and_namelen(de);
-
- /*
-		 * de's de_name, de_namelen, de_entrylen are set.
- * Fill the rest.
- */
- if ((retval =
- reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
-
- /* key of pointed object */
- set_de_object_key(de);
-
- store_de_entry_key(de);
-
- /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
- return retval;
- }
- }
-
- if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
- /*
-		 * we have reached the leftmost entry in the node. Normally
-		 * we would have to go to the left neighbor, but if the
-		 * generation counter is already 0, we know for sure that
-		 * there is no name with the same hash value
-		 */
-		/*
-		 * FIXME: this works correctly only because the hash value
-		 * cannot be 0. Btw, in the case of Yura's hash it probably
-		 * can be, so this is a bug
- */
- return NAME_NOT_FOUND;
-
- RFALSE(de->de_item_num,
- "vs-7015: two diritems of the same directory in one node?");
-
- return GOTO_PREVIOUS_ITEM;
-}
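While walking colliding entries, the loop above records every generation number it meets in de_gen_number_bit_string; reiserfs_add_entry later takes the first clear bit as the generation for the new name. A standalone sketch of that bookkeeping (MAX_GENERATION_NUMBER assumed to be 127 for illustration):

#include <stdio.h>
#include <string.h>

#define MAX_GEN 127	/* assumed value of MAX_GENERATION_NUMBER */

int main(void)
{
	unsigned char used[MAX_GEN / 8 + 1];
	int gen;

	memset(used, 0, sizeof(used));

	/* pretend the scan saw colliding entries with generations 0..2 */
	used[0] |= 1 << 0;
	used[0] |= 1 << 1;
	used[0] |= 1 << 2;

	/* the find_first_zero_bit step */
	for (gen = 0; gen <= MAX_GEN; gen++)
		if (!(used[gen / 8] & (1 << (gen % 8))))
			break;

	printf("next free generation: %d\n", gen);	/* 3 */
	return 0;
}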
-
-/*
- * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
- * FIXME: should add something like IOERROR
- */
-static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
- struct treepath *path_to_entry,
- struct reiserfs_dir_entry *de)
-{
- struct cpu_key key_to_search;
- int retval;
-
- if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
- return NAME_NOT_FOUND;
-
- /* we will search for this key in the tree */
- make_cpu_key(&key_to_search, dir,
- get_third_component(dir->i_sb, name, namelen),
- TYPE_DIRENTRY, 3);
-
- while (1) {
- retval =
- search_by_entry_key(dir->i_sb, &key_to_search,
- path_to_entry, de);
- if (retval == IO_ERROR) {
- reiserfs_error(dir->i_sb, "zam-7001", "io error");
- return IO_ERROR;
- }
-
- /* compare names for all entries having given hash value */
- retval =
- linear_search_in_dir_item(&key_to_search, de, name,
- namelen);
- /*
-		 * there is no need to scan the directory anymore:
-		 * the given entry was either found or does not exist
- */
- if (retval != GOTO_PREVIOUS_ITEM) {
- path_to_entry->pos_in_item = de->de_entry_num;
- return retval;
- }
-
- /*
-		 * the left neighboring item of this directory exists
-		 * and the given entry may be there
- */
- set_cpu_key_k_offset(&key_to_search,
- le_ih_k_offset(de->de_ih) - 1);
- pathrelse(path_to_entry);
-
- } /* while (1) */
-}
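A hedged sketch of the calling convention the lookup paths below follow — initialize a path, search, and release the path whatever the outcome (example_lookup() is an illustrative helper, not part of the original file):

static int example_lookup(struct inode *dir, const char *name, int len)
{
	INITIALIZE_PATH(path);
	struct reiserfs_dir_entry de;
	int retval;

	de.de_gen_number_bit_string = NULL;	/* no collision bookkeeping */
	retval = reiserfs_find_entry(dir, name, len, &path, &de);
	pathrelse(&path);			/* always release the path */

	return retval == NAME_FOUND ? 0 : -ENOENT;
}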
-
-static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- int retval;
- struct inode *inode = NULL;
- struct reiserfs_dir_entry de;
- INITIALIZE_PATH(path_to_entry);
-
- if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
- return ERR_PTR(-ENAMETOOLONG);
-
- reiserfs_write_lock(dir->i_sb);
-
- de.de_gen_number_bit_string = NULL;
- retval =
- reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
- &path_to_entry, &de);
- pathrelse(&path_to_entry);
- if (retval == NAME_FOUND) {
- inode = reiserfs_iget(dir->i_sb,
- (struct cpu_key *)&de.de_dir_id);
- if (!inode || IS_ERR(inode)) {
- reiserfs_write_unlock(dir->i_sb);
- return ERR_PTR(-EACCES);
- }
-
- /*
- * Propagate the private flag so we know we're
- * in the priv tree. Also clear xattr support
- * since we don't have xattrs on xattr files.
- */
- if (IS_PRIVATE(dir))
- reiserfs_init_priv_inode(inode);
- }
- reiserfs_write_unlock(dir->i_sb);
- if (retval == IO_ERROR) {
- return ERR_PTR(-EIO);
- }
-
- return d_splice_alias(inode, dentry);
-}
-
-/*
- * looks up the dentry of the parent directory for child.
- * taken from ext2_get_parent
- */
-struct dentry *reiserfs_get_parent(struct dentry *child)
-{
- int retval;
- struct inode *inode = NULL;
- struct reiserfs_dir_entry de;
- INITIALIZE_PATH(path_to_entry);
- struct inode *dir = d_inode(child);
-
- if (dir->i_nlink == 0) {
- return ERR_PTR(-ENOENT);
- }
- de.de_gen_number_bit_string = NULL;
-
- reiserfs_write_lock(dir->i_sb);
- retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
- pathrelse(&path_to_entry);
- if (retval != NAME_FOUND) {
- reiserfs_write_unlock(dir->i_sb);
- return ERR_PTR(-ENOENT);
- }
- inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
- reiserfs_write_unlock(dir->i_sb);
-
- return d_obtain_alias(inode);
-}
-
-/* add entry to the directory (entry can be hidden).
-
-insert definition of when hidden directories are used here -Hans
-
- Does not mark the dir inode dirty; do it after a successful call to it */
-
-static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
- struct inode *dir, const char *name, int namelen,
- struct inode *inode, int visible)
-{
- struct cpu_key entry_key;
- struct reiserfs_de_head *deh;
- INITIALIZE_PATH(path);
- struct reiserfs_dir_entry de;
- DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
- int gen_number;
-
- /*
-	 * 48 bytes now, and we avoid kmalloc if we
-	 * create a file with a short name
- */
- char small_buf[32 + DEH_SIZE];
-
- char *buffer;
- int buflen, paste_size;
- int retval;
-
- BUG_ON(!th->t_trans_id);
-
- /* each entry has unique key. compose it */
- make_cpu_key(&entry_key, dir,
- get_third_component(dir->i_sb, name, namelen),
- TYPE_DIRENTRY, 3);
-
- /* get memory for composing the entry */
- buflen = DEH_SIZE + ROUND_UP(namelen);
- if (buflen > sizeof(small_buf)) {
- buffer = kmalloc(buflen, GFP_NOFS);
- if (!buffer)
- return -ENOMEM;
- } else
- buffer = small_buf;
-
- paste_size =
- (get_inode_sd_version(dir) ==
- STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
-
- /*
- * fill buffer : directory entry head, name[, dir objectid | ,
- * stat data | ,stat data, dir objectid ]
- */
- deh = (struct reiserfs_de_head *)buffer;
- deh->deh_location = 0; /* JDM Endian safe if 0 */
- put_deh_offset(deh, cpu_key_k_offset(&entry_key));
- deh->deh_state = 0; /* JDM Endian safe if 0 */
- /* put key (ino analog) to de */
-
- /* safe: k_dir_id is le */
- deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
- /* safe: k_objectid is le */
- deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
-
- /* copy name */
- memcpy((char *)(deh + 1), name, namelen);
-	/* pad with 0s to the 4 byte boundary */
- padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
-
- /*
- * entry is ready to be pasted into tree, set 'visibility'
- * and 'stat data in entry' attributes
- */
- mark_de_without_sd(deh);
- visible ? mark_de_visible(deh) : mark_de_hidden(deh);
-
- /* find the proper place for the new entry */
- memset(bit_string, 0, sizeof(bit_string));
- de.de_gen_number_bit_string = bit_string;
- retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
- if (retval != NAME_NOT_FOUND) {
- if (buffer != small_buf)
- kfree(buffer);
- pathrelse(&path);
-
- if (retval == IO_ERROR) {
- return -EIO;
- }
-
- if (retval != NAME_FOUND) {
- reiserfs_error(dir->i_sb, "zam-7002",
- "reiserfs_find_entry() returned "
- "unexpected value (%d)", retval);
- }
-
- return -EEXIST;
- }
-
- gen_number =
- find_first_zero_bit(bit_string,
- MAX_GENERATION_NUMBER + 1);
- if (gen_number > MAX_GENERATION_NUMBER) {
- /* there is no free generation number */
- reiserfs_warning(dir->i_sb, "reiserfs-7010",
- "Congratulations! we have got hash function "
- "screwed up");
- if (buffer != small_buf)
- kfree(buffer);
- pathrelse(&path);
- return -EBUSY;
- }
-	/* adjust offset of directory entry */
- put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
- set_cpu_key_k_offset(&entry_key, deh_offset(deh));
-
- /* update max-hash-collisions counter in reiserfs_sb_info */
- PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
-
- /* we need to re-search for the insertion point */
- if (gen_number != 0) {
- if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
- NAME_NOT_FOUND) {
- reiserfs_warning(dir->i_sb, "vs-7032",
- "entry with this key (%K) already "
- "exists", &entry_key);
-
- if (buffer != small_buf)
- kfree(buffer);
- pathrelse(&path);
- return -EBUSY;
- }
- }
-
- /* perform the insertion of the entry that we have prepared */
- retval =
- reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
- paste_size);
- if (buffer != small_buf)
- kfree(buffer);
- if (retval) {
- reiserfs_check_path(&path);
- return retval;
- }
-
- dir->i_size += paste_size;
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- if (!S_ISDIR(inode->i_mode) && visible)
- /* reiserfs_mkdir or reiserfs_rename will do that by itself */
- reiserfs_update_sd(th, dir);
-
- reiserfs_check_path(&path);
- return 0;
-}
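For reference, the buffer pasted above is one entry head followed by the name padded out to the boundary the in-code comment mentions (plus stat-data/objectid fields for old-format directories). A standalone sketch of the sizing arithmetic (the DEH_SIZE value and the 4-byte padding are assumed for illustration):

#include <stdio.h>

#define DEH_SIZE    16			/* assumed entry-head size */
#define ROUND_UP(x) (((x) + 3) & ~3)	/* assumed 4-byte padding */

int main(void)
{
	int namelen = 5;	/* "hello" */

	/* [ entry head | "hello" | 3 pad bytes ] */
	printf("paste size: %d\n", DEH_SIZE + ROUND_UP(namelen));	/* 24 */
	return 0;
}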
-
-/*
- * quota utility function, call if you've had to abort after calling
- * new_inode_init, and have not called reiserfs_new_inode yet.
- * This should only be called on inodes that do not have stat data
- * inserted into the tree yet.
- */
-static int drop_new_inode(struct inode *inode)
-{
- dquot_drop(inode);
- make_bad_inode(inode);
- inode->i_flags |= S_NOQUOTA;
- iput(inode);
- return 0;
-}
-
-/*
- * utility function that does setup for reiserfs_new_inode.
- * dquot_initialize needs lots of credits so it's better to have it
- * outside of a transaction, so we had to pull some bits of
- * reiserfs_new_inode out into this func.
- */
-static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
-{
- /*
- * Make inode invalid - just in case we are going to drop it before
- * the initialization happens
- */
- INODE_PKEY(inode)->k_objectid = 0;
-
- /*
- * the quota init calls have to know who to charge the quota to, so
- * we have to set uid and gid here
- */
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- return dquot_initialize(inode);
-}
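Every creation path below (create, mknod, mkdir, symlink) opens with the same sequence; a hedged condensation of it (example_new_inode() is illustrative, not part of the original file):

static struct inode *example_new_inode(struct inode *dir, umode_t mode,
				       int *retval)
{
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode) {
		*retval = -ENOMEM;
		return NULL;
	}

	*retval = new_inode_init(inode, dir, mode);
	if (*retval) {
		/* quota is initialized: undo with drop_new_inode, not iput */
		drop_new_inode(inode);
		return NULL;
	}
	return inode;
}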
-
-static int reiserfs_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
-{
- int retval;
- struct inode *inode;
- /*
- * We need blocks for transaction + (user+group)*(quotas
- * for new inode + update of quota for directory owner)
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 2 +
- 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
- REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- struct reiserfs_transaction_handle th;
- struct reiserfs_security_handle security;
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
- if (!(inode = new_inode(dir->i_sb))) {
- return -ENOMEM;
- }
- retval = new_inode_init(inode, dir, mode);
- if (retval) {
- drop_new_inode(inode);
- return retval;
- }
-
- jbegin_count += reiserfs_cache_default_acl(dir);
- retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
- if (retval < 0) {
- drop_new_inode(inode);
- return retval;
- }
- jbegin_count += retval;
- reiserfs_write_lock(dir->i_sb);
-
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval) {
- drop_new_inode(inode);
- goto out_failed;
- }
-
- retval =
- reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
- inode, &security);
- if (retval)
- goto out_failed;
-
- inode->i_op = &reiserfs_file_inode_operations;
- inode->i_fop = &reiserfs_file_operations;
- inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
- retval =
- reiserfs_add_entry(&th, dir, dentry->d_name.name,
- dentry->d_name.len, inode, 1 /*visible */ );
- if (retval) {
- int err;
- drop_nlink(inode);
- reiserfs_update_sd(&th, inode);
- err = journal_end(&th);
- if (err)
- retval = err;
- unlock_new_inode(inode);
- iput(inode);
- goto out_failed;
- }
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- d_instantiate_new(dentry, inode);
- retval = journal_end(&th);
-
-out_failed:
- reiserfs_write_unlock(dir->i_sb);
- reiserfs_security_free(&security);
- return retval;
-}
-
-static int reiserfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
-{
- int retval;
- struct inode *inode;
- struct reiserfs_transaction_handle th;
- struct reiserfs_security_handle security;
- /*
- * We need blocks for transaction + (user+group)*(quotas
- * for new inode + update of quota for directory owner)
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 +
- 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
- REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
- if (!(inode = new_inode(dir->i_sb))) {
- return -ENOMEM;
- }
- retval = new_inode_init(inode, dir, mode);
- if (retval) {
- drop_new_inode(inode);
- return retval;
- }
-
- jbegin_count += reiserfs_cache_default_acl(dir);
- retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
- if (retval < 0) {
- drop_new_inode(inode);
- return retval;
- }
- jbegin_count += retval;
- reiserfs_write_lock(dir->i_sb);
-
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval) {
- drop_new_inode(inode);
- goto out_failed;
- }
-
- retval =
- reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
- inode, &security);
- if (retval) {
- goto out_failed;
- }
-
- inode->i_op = &reiserfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
-
- /* FIXME: needed for block and char devices only */
- reiserfs_update_sd(&th, inode);
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- retval =
- reiserfs_add_entry(&th, dir, dentry->d_name.name,
- dentry->d_name.len, inode, 1 /*visible */ );
- if (retval) {
- int err;
- drop_nlink(inode);
- reiserfs_update_sd(&th, inode);
- err = journal_end(&th);
- if (err)
- retval = err;
- unlock_new_inode(inode);
- iput(inode);
- goto out_failed;
- }
-
- d_instantiate_new(dentry, inode);
- retval = journal_end(&th);
-
-out_failed:
- reiserfs_write_unlock(dir->i_sb);
- reiserfs_security_free(&security);
- return retval;
-}
-
-static int reiserfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- int retval;
- struct inode *inode;
- struct reiserfs_transaction_handle th;
- struct reiserfs_security_handle security;
- /*
- * We need blocks for transaction + (user+group)*(quotas
- * for new inode + update of quota for directory owner)
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 +
- 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
- REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /*
- * set flag that new packing locality created and new blocks
- * for the content of that directory are not displaced yet
- */
- REISERFS_I(dir)->new_packing_locality = 1;
-#endif
- mode = S_IFDIR | mode;
- if (!(inode = new_inode(dir->i_sb))) {
- return -ENOMEM;
- }
- retval = new_inode_init(inode, dir, mode);
- if (retval) {
- drop_new_inode(inode);
- return retval;
- }
-
- jbegin_count += reiserfs_cache_default_acl(dir);
- retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
- if (retval < 0) {
- drop_new_inode(inode);
- return retval;
- }
- jbegin_count += retval;
- reiserfs_write_lock(dir->i_sb);
-
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval) {
- drop_new_inode(inode);
- goto out_failed;
- }
-
- /*
- * inc the link count now, so another writer doesn't overflow
- * it while we sleep later on.
- */
- INC_DIR_INODE_NLINK(dir)
-
- retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */,
- old_format_only(dir->i_sb) ?
- EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
- dentry, inode, &security);
- if (retval) {
- DEC_DIR_INODE_NLINK(dir)
- goto out_failed;
- }
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- inode->i_op = &reiserfs_dir_inode_operations;
- inode->i_fop = &reiserfs_dir_operations;
-
- /* note, _this_ add_entry will not update dir's stat data */
- retval =
- reiserfs_add_entry(&th, dir, dentry->d_name.name,
- dentry->d_name.len, inode, 1 /*visible */ );
- if (retval) {
- int err;
- clear_nlink(inode);
- DEC_DIR_INODE_NLINK(dir);
- reiserfs_update_sd(&th, inode);
- err = journal_end(&th);
- if (err)
- retval = err;
- unlock_new_inode(inode);
- iput(inode);
- goto out_failed;
- }
- /* the above add_entry did not update dir's stat data */
- reiserfs_update_sd(&th, dir);
-
- d_instantiate_new(dentry, inode);
- retval = journal_end(&th);
-out_failed:
- reiserfs_write_unlock(dir->i_sb);
- reiserfs_security_free(&security);
- return retval;
-}
-
-static inline int reiserfs_empty_dir(struct inode *inode)
-{
- /*
- * we can cheat because an old format dir cannot have
- * EMPTY_DIR_SIZE, and a new format dir cannot have
- * EMPTY_DIR_SIZE_V1. So, if the inode is either size,
- * regardless of disk format version, the directory is empty.
- */
- if (inode->i_size != EMPTY_DIR_SIZE &&
- inode->i_size != EMPTY_DIR_SIZE_V1) {
- return 0;
- }
- return 1;
-}
-
-static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
- int retval, err;
- struct inode *inode;
- struct reiserfs_transaction_handle th;
- int jbegin_count;
- INITIALIZE_PATH(path);
- struct reiserfs_dir_entry de;
-
- /*
-	 * we will be doing 2 balancings and updating 2 stat data; we
-	 * change the quotas of the owner of the directory and of the
-	 * owner of the parent directory. The quota structure is possibly
-	 * deleted only on the last iput => outside of this transaction
- */
- jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 2 + 2 +
- 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
- reiserfs_write_lock(dir->i_sb);
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval)
- goto out_rmdir;
-
- de.de_gen_number_bit_string = NULL;
- if ((retval =
- reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
- &path, &de)) == NAME_NOT_FOUND) {
- retval = -ENOENT;
- goto end_rmdir;
- } else if (retval == IO_ERROR) {
- retval = -EIO;
- goto end_rmdir;
- }
-
- inode = d_inode(dentry);
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- if (de.de_objectid != inode->i_ino) {
- /*
- * FIXME: compare key of an object and a key found in the entry
- */
- retval = -EIO;
- goto end_rmdir;
- }
- if (!reiserfs_empty_dir(inode)) {
- retval = -ENOTEMPTY;
- goto end_rmdir;
- }
-
- /* cut entry from dir directory */
- retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
- dir, NULL, /* page */
- 0 /*new file size - not used here */ );
- if (retval < 0)
- goto end_rmdir;
-
- if (inode->i_nlink != 2 && inode->i_nlink != 1)
- reiserfs_error(inode->i_sb, "reiserfs-7040",
- "empty directory has nlink != 2 (%d)",
- inode->i_nlink);
-
- clear_nlink(inode);
- inode_set_mtime_to_ts(dir,
- inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
- reiserfs_update_sd(&th, inode);
-
- DEC_DIR_INODE_NLINK(dir)
- dir->i_size -= (DEH_SIZE + de.de_entrylen);
- reiserfs_update_sd(&th, dir);
-
- /* prevent empty directory from getting lost */
- add_save_link(&th, inode, 0 /* not truncate */ );
-
- retval = journal_end(&th);
- reiserfs_check_path(&path);
-out_rmdir:
- reiserfs_write_unlock(dir->i_sb);
- return retval;
-
-end_rmdir:
- /*
-	 * we must release the path: either we did not call
-	 * reiserfs_cut_from_item, or reiserfs_cut_from_item did not
-	 * release the path because the operation was not complete
- */
- pathrelse(&path);
- err = journal_end(&th);
- reiserfs_write_unlock(dir->i_sb);
- return err ? err : retval;
-}
-
-static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
-{
- int retval, err;
- struct inode *inode;
- struct reiserfs_dir_entry de;
- INITIALIZE_PATH(path);
- struct reiserfs_transaction_handle th;
- int jbegin_count;
- unsigned long savelink;
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
- inode = d_inode(dentry);
-
- /*
-	 * in this transaction we can be doing at most two balancings and
-	 * updating two stat data; we change the quotas of the owner of the
- * directory and of the owner of the parent directory. The quota
- * structure is possibly deleted only on iput => outside of
- * this transaction
- */
- jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 2 + 2 +
- 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
- reiserfs_write_lock(dir->i_sb);
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval)
- goto out_unlink;
-
- de.de_gen_number_bit_string = NULL;
- if ((retval =
- reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
- &path, &de)) == NAME_NOT_FOUND) {
- retval = -ENOENT;
- goto end_unlink;
- } else if (retval == IO_ERROR) {
- retval = -EIO;
- goto end_unlink;
- }
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- if (de.de_objectid != inode->i_ino) {
- /*
- * FIXME: compare key of an object and a key found in the entry
- */
- retval = -EIO;
- goto end_unlink;
- }
-
- if (!inode->i_nlink) {
- reiserfs_warning(inode->i_sb, "reiserfs-7042",
- "deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
- set_nlink(inode, 1);
- }
-
- drop_nlink(inode);
-
- /*
-	 * we may schedule before doing the add_save_link call, so save
-	 * the link count to make sure we don't race
- */
- savelink = inode->i_nlink;
-
- retval =
- reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
- 0);
- if (retval < 0) {
- inc_nlink(inode);
- goto end_unlink;
- }
- inode_set_ctime_current(inode);
- reiserfs_update_sd(&th, inode);
-
- dir->i_size -= (de.de_entrylen + DEH_SIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- reiserfs_update_sd(&th, dir);
-
- if (!savelink)
- /* prevent file from getting lost */
- add_save_link(&th, inode, 0 /* not truncate */ );
-
- retval = journal_end(&th);
- reiserfs_check_path(&path);
- reiserfs_write_unlock(dir->i_sb);
- return retval;
-
-end_unlink:
- pathrelse(&path);
- err = journal_end(&th);
- reiserfs_check_path(&path);
- if (err)
- retval = err;
-out_unlink:
- reiserfs_write_unlock(dir->i_sb);
- return retval;
-}
-
-static int reiserfs_symlink(struct mnt_idmap *idmap,
- struct inode *parent_dir, struct dentry *dentry,
- const char *symname)
-{
- int retval;
- struct inode *inode;
- char *name;
- int item_len;
- struct reiserfs_transaction_handle th;
- struct reiserfs_security_handle security;
- int mode = S_IFLNK | S_IRWXUGO;
- /*
- * We need blocks for transaction + (user+group)*(quotas for
- * new inode + update of quota for directory owner)
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 +
- 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
- REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
-
- retval = dquot_initialize(parent_dir);
- if (retval)
- return retval;
-
- if (!(inode = new_inode(parent_dir->i_sb))) {
- return -ENOMEM;
- }
- retval = new_inode_init(inode, parent_dir, mode);
- if (retval) {
- drop_new_inode(inode);
- return retval;
- }
-
- retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
- &security);
- if (retval < 0) {
- drop_new_inode(inode);
- return retval;
- }
- jbegin_count += retval;
-
- reiserfs_write_lock(parent_dir->i_sb);
- item_len = ROUND_UP(strlen(symname));
- if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
- retval = -ENAMETOOLONG;
- drop_new_inode(inode);
- goto out_failed;
- }
-
- name = kmalloc(item_len, GFP_NOFS);
- if (!name) {
- drop_new_inode(inode);
- retval = -ENOMEM;
- goto out_failed;
- }
- memcpy(name, symname, strlen(symname));
- padd_item(name, item_len, strlen(symname));
-
- retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
- if (retval) {
- drop_new_inode(inode);
- kfree(name);
- goto out_failed;
- }
-
- retval =
- reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
- dentry, inode, &security);
- kfree(name);
- if (retval) { /* reiserfs_new_inode iputs for us */
- goto out_failed;
- }
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(parent_dir);
-
- inode->i_op = &reiserfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
- retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
- dentry->d_name.len, inode, 1 /*visible */ );
- if (retval) {
- int err;
- drop_nlink(inode);
- reiserfs_update_sd(&th, inode);
- err = journal_end(&th);
- if (err)
- retval = err;
- unlock_new_inode(inode);
- iput(inode);
- goto out_failed;
- }
-
- d_instantiate_new(dentry, inode);
- retval = journal_end(&th);
-out_failed:
- reiserfs_write_unlock(parent_dir->i_sb);
- reiserfs_security_free(&security);
- return retval;
-}
-
-static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- int retval;
- struct inode *inode = d_inode(old_dentry);
- struct reiserfs_transaction_handle th;
- /*
- * We need blocks for transaction + update of quotas for
- * the owners of the directory
- */
- int jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 +
- 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
-
- reiserfs_write_lock(dir->i_sb);
- if (inode->i_nlink >= REISERFS_LINK_MAX) {
- /* FIXME: sd_nlink is 32 bit for new files */
- reiserfs_write_unlock(dir->i_sb);
- return -EMLINK;
- }
-
- /* inc before scheduling so reiserfs_unlink knows we are here */
- inc_nlink(inode);
-
- retval = journal_begin(&th, dir->i_sb, jbegin_count);
- if (retval) {
- drop_nlink(inode);
- reiserfs_write_unlock(dir->i_sb);
- return retval;
- }
-
- /* create new entry */
- retval =
- reiserfs_add_entry(&th, dir, dentry->d_name.name,
- dentry->d_name.len, inode, 1 /*visible */ );
-
- reiserfs_update_inode_transaction(inode);
- reiserfs_update_inode_transaction(dir);
-
- if (retval) {
- int err;
- drop_nlink(inode);
- err = journal_end(&th);
- reiserfs_write_unlock(dir->i_sb);
- return err ? err : retval;
- }
-
- inode_set_ctime_current(inode);
- reiserfs_update_sd(&th, inode);
-
- ihold(inode);
- d_instantiate(dentry, inode);
- retval = journal_end(&th);
- reiserfs_write_unlock(dir->i_sb);
- return retval;
-}
-
-/* de contains information pointing to an entry; check it still matches the given name */
-static int de_still_valid(const char *name, int len,
- struct reiserfs_dir_entry *de)
-{
- struct reiserfs_dir_entry tmp = *de;
-
- /* recalculate pointer to name and name length */
- set_de_name_and_namelen(&tmp);
- /* FIXME: could check more */
- if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
- return 0;
- return 1;
-}
-
-static int entry_points_to_object(const char *name, int len,
- struct reiserfs_dir_entry *de,
- struct inode *inode)
-{
- if (!de_still_valid(name, len, de))
- return 0;
-
- if (inode) {
- if (!de_visible(de->de_deh + de->de_entry_num))
- reiserfs_panic(inode->i_sb, "vs-7042",
- "entry must be visible");
- return (de->de_objectid == inode->i_ino) ? 1 : 0;
- }
-
-	/* this must be a newly added hidden entry */
- if (de_visible(de->de_deh + de->de_entry_num))
- reiserfs_panic(NULL, "vs-7043", "entry must be visible");
-
- return 1;
-}
-
-/* sets key of objectid the entry has to point to */
-static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
- struct reiserfs_key *key)
-{
- /* JDM These operations are endian safe - both are le */
- de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
- de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
-}
-
-/*
- * a process that is going to call fix_nodes/do_balance must hold only
- * one path. If it holds 2 or more, it can get into endless waiting in
- * get_empty_nodes or its clones
- */
-static int reiserfs_rename(struct mnt_idmap *idmap,
- struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
-{
- int retval;
- INITIALIZE_PATH(old_entry_path);
- INITIALIZE_PATH(new_entry_path);
- INITIALIZE_PATH(dot_dot_entry_path);
- struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
- struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
- struct inode *old_inode, *new_dentry_inode;
- struct reiserfs_transaction_handle th;
- int jbegin_count;
- unsigned long savelink = 1;
- bool update_dir_parent = false;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- /*
- * three balancings: (1) old name removal, (2) new name insertion
- * and (3) maybe "save" link insertion
- * stat data updates: (1) old directory,
- * (2) new directory and (3) maybe old object stat data (when it is
- * directory) and (4) maybe stat data of object to which new entry
- * pointed initially and (5) maybe block containing ".." of
- * renamed directory
- * quota updates: two parent directories
- */
- jbegin_count =
- JOURNAL_PER_BALANCE_CNT * 3 + 5 +
- 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
-
- retval = dquot_initialize(old_dir);
- if (retval)
- return retval;
- retval = dquot_initialize(new_dir);
- if (retval)
- return retval;
-
- old_inode = d_inode(old_dentry);
- new_dentry_inode = d_inode(new_dentry);
-
- /*
- * make sure that oldname still exists and points to an object we
- * are going to rename
- */
- old_de.de_gen_number_bit_string = NULL;
- reiserfs_write_lock(old_dir->i_sb);
- retval =
- reiserfs_find_entry(old_dir, old_dentry->d_name.name,
- old_dentry->d_name.len, &old_entry_path,
- &old_de);
- pathrelse(&old_entry_path);
- if (retval == IO_ERROR) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
-
- if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -ENOENT;
- }
-
- if (S_ISDIR(old_inode->i_mode)) {
- /*
- * make sure that directory being renamed has correct ".."
- * and that its new parent directory has not too many links
- * already
- */
- if (new_dentry_inode) {
- if (!reiserfs_empty_dir(new_dentry_inode)) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -ENOTEMPTY;
- }
- }
-
- if (old_dir != new_dir) {
- /*
- * directory is renamed, its parent directory will be
- * changed, so find ".." entry
- */
- dot_dot_de.de_gen_number_bit_string = NULL;
- retval =
- reiserfs_find_entry(old_inode, "..", 2,
- &dot_dot_entry_path,
- &dot_dot_de);
- pathrelse(&dot_dot_entry_path);
- if (retval != NAME_FOUND) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
-
- /* inode number of .. must equal old_dir->i_ino */
- if (dot_dot_de.de_objectid != old_dir->i_ino) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
- update_dir_parent = true;
- }
- }
-
- retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
- if (retval) {
- reiserfs_write_unlock(old_dir->i_sb);
- return retval;
- }
-
- /* add new entry (or find the existing one) */
- retval =
- reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
- new_dentry->d_name.len, old_inode, 0);
- if (retval == -EEXIST) {
- if (!new_dentry_inode) {
- reiserfs_panic(old_dir->i_sb, "vs-7050",
- "new entry is found, new inode == 0");
- }
- } else if (retval) {
- int err = journal_end(&th);
- reiserfs_write_unlock(old_dir->i_sb);
- return err ? err : retval;
- }
-
- reiserfs_update_inode_transaction(old_dir);
- reiserfs_update_inode_transaction(new_dir);
-
- /*
- * this makes it so an fsync on an open fd for the old name will
- * commit the rename operation
- */
- reiserfs_update_inode_transaction(old_inode);
-
- if (new_dentry_inode)
- reiserfs_update_inode_transaction(new_dentry_inode);
-
- while (1) {
- /*
- * look for old name using corresponding entry key
- * (found by reiserfs_find_entry)
- */
- if ((retval =
- search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
- &old_entry_path,
- &old_de)) != NAME_FOUND) {
- pathrelse(&old_entry_path);
- journal_end(&th);
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
-
- copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
-
- reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
-
- /* look for new name by reiserfs_find_entry */
- new_de.de_gen_number_bit_string = NULL;
- retval =
- reiserfs_find_entry(new_dir, new_dentry->d_name.name,
- new_dentry->d_name.len, &new_entry_path,
- &new_de);
- /*
-		 * reiserfs_find_entry should not return IO_ERROR,
-		 * because it is called with essentially the same parameters
-		 * from reiserfs_add_entry above, and we'll catch any i/o
-		 * errors before we get here.
- */
- if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
- pathrelse(&new_entry_path);
- pathrelse(&old_entry_path);
- journal_end(&th);
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
-
- copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
-
- reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
-
- if (update_dir_parent) {
- if ((retval =
- search_by_entry_key(new_dir->i_sb,
- &dot_dot_de.de_entry_key,
- &dot_dot_entry_path,
- &dot_dot_de)) != NAME_FOUND) {
- pathrelse(&dot_dot_entry_path);
- pathrelse(&new_entry_path);
- pathrelse(&old_entry_path);
- journal_end(&th);
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
- copy_item_head(&dot_dot_ih,
- tp_item_head(&dot_dot_entry_path));
- /* node containing ".." gets into transaction */
- reiserfs_prepare_for_journal(old_inode->i_sb,
- dot_dot_de.de_bh, 1);
- }
- /*
- * we should check seals here, not do
- * this stuff, yes? Then, having
- * gathered everything into RAM we
- * should lock the buffers, yes? -Hans
- */
- /*
- * probably. our rename needs to hold more
- * than one path at once. The seals would
- * have to be written to deal with multi-path
- * issues -chris
- */
- /*
- * sanity checking before doing the rename - avoid races many
- * of the above checks could have scheduled. We have to be
- * sure our items haven't been shifted by another process.
- */
- if (item_moved(&new_entry_ih, &new_entry_path) ||
- !entry_points_to_object(new_dentry->d_name.name,
- new_dentry->d_name.len,
- &new_de, new_dentry_inode) ||
- item_moved(&old_entry_ih, &old_entry_path) ||
- !entry_points_to_object(old_dentry->d_name.name,
- old_dentry->d_name.len,
- &old_de, old_inode)) {
- reiserfs_restore_prepared_buffer(old_inode->i_sb,
- new_de.de_bh);
- reiserfs_restore_prepared_buffer(old_inode->i_sb,
- old_de.de_bh);
-			if (update_dir_parent)
-				reiserfs_restore_prepared_buffer(old_inode->i_sb,
-								 dot_dot_de.de_bh);
- continue;
- }
- if (update_dir_parent) {
- if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
- !entry_points_to_object("..", 2, &dot_dot_de,
- old_dir)) {
-				reiserfs_restore_prepared_buffer(old_inode->i_sb,
-								 old_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->i_sb,
-								 new_de.de_bh);
-				reiserfs_restore_prepared_buffer(old_inode->i_sb,
-								 dot_dot_de.de_bh);
- continue;
- }
- }
-
- RFALSE(update_dir_parent &&
- !buffer_journal_prepared(dot_dot_de.de_bh), "");
-
- break;
- }
-
- /*
- * ok, all the changes can be done in one fell swoop when we
- * have claimed all the buffers needed.
- */
-
- mark_de_visible(new_de.de_deh + new_de.de_entry_num);
- set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
- journal_mark_dirty(&th, new_de.de_bh);
-
- mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
- journal_mark_dirty(&th, old_de.de_bh);
- /*
- * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
- * which adds ctime update of renamed object
- */
- simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
-
- if (new_dentry_inode) {
- /* adjust link number of the victim */
- if (S_ISDIR(new_dentry_inode->i_mode)) {
- clear_nlink(new_dentry_inode);
- } else {
- drop_nlink(new_dentry_inode);
- }
- savelink = new_dentry_inode->i_nlink;
- }
-
- if (update_dir_parent) {
- /* adjust ".." of renamed directory */
- set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
- journal_mark_dirty(&th, dot_dot_de.de_bh);
- }
- if (S_ISDIR(old_inode->i_mode)) {
- /*
- * there (in new_dir) was no directory, so it got new link
- * (".." of renamed directory)
- */
- if (!new_dentry_inode)
- INC_DIR_INODE_NLINK(new_dir);
-
-		/* old directory lost one link - ".." of renamed directory */
- DEC_DIR_INODE_NLINK(old_dir);
- }
- /*
- * looks like in 2.3.99pre3 brelse is atomic.
- * so we can use pathrelse
- */
- pathrelse(&new_entry_path);
- pathrelse(&dot_dot_entry_path);
-
- /*
- * FIXME: this reiserfs_cut_from_item's return value may screw up
-	 * anybody, but it will panic if it is unable to find the
-	 * entry. This needs one more cleanup
- */
- if (reiserfs_cut_from_item
- (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
- 0) < 0)
- reiserfs_error(old_dir->i_sb, "vs-7060",
-			       "could not cut old name. Fsck later?");
-
- old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
-
- reiserfs_update_sd(&th, old_dir);
- reiserfs_update_sd(&th, new_dir);
- reiserfs_update_sd(&th, old_inode);
-
- if (new_dentry_inode) {
- if (savelink == 0)
- add_save_link(&th, new_dentry_inode,
- 0 /* not truncate */ );
- reiserfs_update_sd(&th, new_dentry_inode);
- }
-
- retval = journal_end(&th);
- reiserfs_write_unlock(old_dir->i_sb);
- return retval;
-}
-
-static const struct inode_operations reiserfs_priv_dir_inode_operations = {
- .create = reiserfs_create,
- .lookup = reiserfs_lookup,
- .link = reiserfs_link,
- .unlink = reiserfs_unlink,
- .symlink = reiserfs_symlink,
- .mkdir = reiserfs_mkdir,
- .rmdir = reiserfs_rmdir,
- .mknod = reiserfs_mknod,
- .rename = reiserfs_rename,
- .setattr = reiserfs_setattr,
- .permission = reiserfs_permission,
- .fileattr_get = reiserfs_fileattr_get,
- .fileattr_set = reiserfs_fileattr_set,
-};
-
-static const struct inode_operations reiserfs_priv_symlink_inode_operations = {
- .get_link = page_get_link,
- .setattr = reiserfs_setattr,
- .permission = reiserfs_permission,
-};
-
-static const struct inode_operations reiserfs_priv_special_inode_operations = {
- .setattr = reiserfs_setattr,
- .permission = reiserfs_permission,
-};
-
-void reiserfs_init_priv_inode(struct inode *inode)
-{
- inode->i_flags |= S_PRIVATE;
- inode->i_opflags &= ~IOP_XATTR;
-
- if (S_ISREG(inode->i_mode))
- inode->i_op = &reiserfs_priv_file_inode_operations;
- else if (S_ISDIR(inode->i_mode))
- inode->i_op = &reiserfs_priv_dir_inode_operations;
- else if (S_ISLNK(inode->i_mode))
- inode->i_op = &reiserfs_priv_symlink_inode_operations;
- else
- inode->i_op = &reiserfs_priv_special_inode_operations;
-}
-
-/* directories can handle most operations... */
-const struct inode_operations reiserfs_dir_inode_operations = {
- .create = reiserfs_create,
- .lookup = reiserfs_lookup,
- .link = reiserfs_link,
- .unlink = reiserfs_unlink,
- .symlink = reiserfs_symlink,
- .mkdir = reiserfs_mkdir,
- .rmdir = reiserfs_rmdir,
- .mknod = reiserfs_mknod,
- .rename = reiserfs_rename,
- .setattr = reiserfs_setattr,
- .listxattr = reiserfs_listxattr,
- .permission = reiserfs_permission,
- .get_inode_acl = reiserfs_get_acl,
- .set_acl = reiserfs_set_acl,
- .fileattr_get = reiserfs_fileattr_get,
- .fileattr_set = reiserfs_fileattr_set,
-};
-
-/*
- * symlink operations.. same as page_symlink_inode_operations, with xattr
- * stuff added
- */
-const struct inode_operations reiserfs_symlink_inode_operations = {
- .get_link = page_get_link,
- .setattr = reiserfs_setattr,
- .listxattr = reiserfs_listxattr,
- .permission = reiserfs_permission,
-};
-
-/*
- * special file operations.. just xattr/acl stuff
- */
-const struct inode_operations reiserfs_special_inode_operations = {
- .setattr = reiserfs_setattr,
- .listxattr = reiserfs_listxattr,
- .permission = reiserfs_permission,
- .get_inode_acl = reiserfs_get_acl,
- .set_acl = reiserfs_set_acl,
-};
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
deleted file mode 100644
index 34baf5c0f265..000000000000
--- a/fs/reiserfs/objectid.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/uuid.h>
-#include "reiserfs.h"
-
-/* find where objectid map starts */
-#define objectid_map(s,rs) (old_format_only (s) ? \
- (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
- (__le32 *)((rs) + 1))
-
-#ifdef CONFIG_REISERFS_CHECK
-
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{
- if (le32_to_cpu(map[0]) != 1)
- reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
- (long unsigned int)le32_to_cpu(map[0]));
-
- /* FIXME: add something else here */
-}
-
-#else
-static void check_objectid_map(struct super_block *s, __le32 * map)
-{
-}
-#endif
-
-/*
- * When we allocate objectids we allocate the first unused objectid.
- * Each sequence of objectids in use (the odd sequences) is followed
- * by a sequence of objectids not in use (the even sequences). We
- * only need to record the last objectid in each of these sequences
- * (both the odd and even sequences) in order to fully define the
- * boundaries of the sequences. A consequence of allocating the first
- * objectid not in use is that under most conditions this scheme is
- * extremely compact. The exception is immediately after a sequence
- * of operations which deletes a large number of objects of
- * non-sequential objectids, and even then it will become compact
- * again as soon as more objects are created. Note that many
- * interesting optimizations of layout could result from complicating
- * objectid assignment, but we have deferred making them for now.
- */
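Concretely: even map slots open a run of objectids in use and odd slots open the free run that follows, so a map of {1, 5, 7, 10} means 1-4 used, 5-6 free, 7-9 used, 10 and up free. A standalone sketch of a membership test over that encoding (CPU-endian values for simplicity; on disk the slots are little-endian):

#include <stdio.h>

/* even slots start a used run, odd slots start the free run after it */
static int objectid_is_used(const unsigned *map, int size, unsigned id)
{
	int i;

	for (i = 0; i + 1 < size; i += 2)
		if (id >= map[i] && id < map[i + 1])
			return 1;
	return 0;
}

int main(void)
{
	unsigned map[] = { 1, 5, 7, 10 };

	printf("%d %d %d\n",
	       objectid_is_used(map, 4, 4),	/* 1: inside 1..4 */
	       objectid_is_used(map, 4, 6),	/* 0: inside 5..6 */
	       objectid_is_used(map, 4, 8));	/* 1: inside 7..9 */
	return 0;
}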
-
-/* get unique object identifier */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
-{
- struct super_block *s = th->t_super;
- struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
- __le32 *map = objectid_map(s, rs);
- __u32 unused_objectid;
-
- BUG_ON(!th->t_trans_id);
-
- check_objectid_map(s, map);
-
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-	/* map[1] is the first unused objectid */
- unused_objectid = le32_to_cpu(map[1]);
- if (unused_objectid == U32_MAX) {
- reiserfs_warning(s, "reiserfs-15100", "no more object ids");
- reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
- return 0;
- }
-
- /*
- * This incrementation allocates the first unused objectid. That
- * is to say, the first entry on the objectid map is the first
- * unused objectid, and by incrementing it we use it. See below
- * where we check to see if we eliminated a sequence of unused
- * objectids....
- */
- map[1] = cpu_to_le32(unused_objectid + 1);
-
- /*
- * Now we check to see if we eliminated the last remaining member of
- * the first even sequence (and can eliminate the sequence by
- * eliminating its last objectid from oids), and can collapse the
- * first two odd sequences into one sequence. If so, then the net
- * result is to eliminate a pair of objectids from oids. We do this
- * by shifting the entire map to the left.
- */
- if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
- memmove(map + 1, map + 3,
- (sb_oid_cursize(rs) - 3) * sizeof(__u32));
- set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
- }
-
- journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
- return unused_objectid;
-}
-
-/* makes object identifier unused */
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
- __u32 objectid_to_release)
-{
- struct super_block *s = th->t_super;
- struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
- __le32 *map = objectid_map(s, rs);
- int i = 0;
-
- BUG_ON(!th->t_trans_id);
- /*return; */
- check_objectid_map(s, map);
-
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
-
- /*
- * start at the beginning of the objectid map (i = 0) and go to
- * the end of it (i = disk_sb->s_oid_cursize). Linear search is
- * what we use, though it is possible that binary search would be
- * more efficient after performing lots of deletions (which is
- * when oids is large.) We only check even i's.
- */
- while (i < sb_oid_cursize(rs)) {
- if (objectid_to_release == le32_to_cpu(map[i])) {
- /* This incrementation unallocates the objectid. */
- le32_add_cpu(&map[i], 1);
-
- /*
- * Did we unallocate the last member of an
- * odd sequence, and can shrink oids?
- */
- if (map[i] == map[i + 1]) {
- /* shrink objectid map */
- memmove(map + i, map + i + 2,
- (sb_oid_cursize(rs) - i -
- 2) * sizeof(__u32));
- set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
-
- RFALSE(sb_oid_cursize(rs) < 2 ||
- sb_oid_cursize(rs) > sb_oid_maxsize(rs),
- "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
- sb_oid_cursize(rs), sb_oid_maxsize(rs));
- }
- return;
- }
-
- if (objectid_to_release > le32_to_cpu(map[i]) &&
- objectid_to_release < le32_to_cpu(map[i + 1])) {
- /* size of objectid map is not changed */
- if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
- le32_add_cpu(&map[i + 1], -1);
- return;
- }
-
- /*
- * The objectid map must be expanded to record the new free run,
- * but there is no space left in it: leak the objectid instead.
- */
- if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
- PROC_INFO_INC(s, leaked_oid);
- return;
- }
-
- /* expand the objectid map */
- memmove(map + i + 3, map + i + 1,
- (sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
- map[i + 1] = cpu_to_le32(objectid_to_release);
- map[i + 2] = cpu_to_le32(objectid_to_release + 1);
- set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
- return;
- }
- i += 2;
- }
-
- reiserfs_error(s, "vs-15011", "tried to free an already free object id (%lu)",
- (unsigned long)objectid_to_release);
-}
-
-int reiserfs_convert_objectid_map_v1(struct super_block *s)
-{
- struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
- int cur_size = sb_oid_cursize(disk_sb);
- int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
- int old_max = sb_oid_maxsize(disk_sb);
- struct reiserfs_super_block_v1 *disk_sb_v1;
- __le32 *objectid_map;
- int i;
-
- disk_sb_v1 =
- (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
- objectid_map = (__le32 *) (disk_sb_v1 + 1);
-
- if (cur_size > new_size) {
- /*
- * mark everyone used that was listed as free at
- * the end of the objectid map
- */
- objectid_map[new_size - 1] = objectid_map[cur_size - 1];
- set_sb_oid_cursize(disk_sb, new_size);
- }
- /* move the smaller objectid map past the end of the new super */
- for (i = new_size - 1; i >= 0; i--) {
- objectid_map[i + (old_max - new_size)] = objectid_map[i];
- }
-
- /* set the max size so we don't overflow later */
- set_sb_oid_maxsize(disk_sb, new_size);
-
- /* Zero out label and generate random UUID */
- memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
- generate_random_uuid(disk_sb->s_uuid);
-
- /* finally, zero out the unused chunk of the new super */
- memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
- return 0;
-}
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
deleted file mode 100644
index 84a194b77f19..000000000000
--- a/fs/reiserfs/prints.c
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-#include <linux/time.h>
-#include <linux/fs.h>
-#include "reiserfs.h"
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-
-#include <linux/stdarg.h>
-
-static char error_buf[1024];
-static char fmt_buf[1024];
-static char off_buf[80];
-
-static char *reiserfs_cpu_offset(struct cpu_key *key)
-{
- if (cpu_key_k_type(key) == TYPE_DIRENTRY)
- sprintf(off_buf, "%llu(%llu)",
- (unsigned long long)
- GET_HASH_VALUE(cpu_key_k_offset(key)),
- (unsigned long long)
- GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
- else
- sprintf(off_buf, "0x%Lx",
- (unsigned long long)cpu_key_k_offset(key));
- return off_buf;
-}
-
-static char *le_offset(struct reiserfs_key *key)
-{
- int version;
-
- version = le_key_version(key);
- if (le_key_k_type(version, key) == TYPE_DIRENTRY)
- sprintf(off_buf, "%llu(%llu)",
- (unsigned long long)
- GET_HASH_VALUE(le_key_k_offset(version, key)),
- (unsigned long long)
- GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
- else
- sprintf(off_buf, "0x%Lx",
- (unsigned long long)le_key_k_offset(version, key));
- return off_buf;
-}
-
-static char *cpu_type(struct cpu_key *key)
-{
- if (cpu_key_k_type(key) == TYPE_STAT_DATA)
- return "SD";
- if (cpu_key_k_type(key) == TYPE_DIRENTRY)
- return "DIR";
- if (cpu_key_k_type(key) == TYPE_DIRECT)
- return "DIRECT";
- if (cpu_key_k_type(key) == TYPE_INDIRECT)
- return "IND";
- return "UNKNOWN";
-}
-
-static char *le_type(struct reiserfs_key *key)
-{
- int version;
-
- version = le_key_version(key);
-
- if (le_key_k_type(version, key) == TYPE_STAT_DATA)
- return "SD";
- if (le_key_k_type(version, key) == TYPE_DIRENTRY)
- return "DIR";
- if (le_key_k_type(version, key) == TYPE_DIRECT)
- return "DIRECT";
- if (le_key_k_type(version, key) == TYPE_INDIRECT)
- return "IND";
- return "UNKNOWN";
-}
-
-/* %k */
-static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key)
-{
- if (key)
- return scnprintf(buf, size, "[%d %d %s %s]",
- le32_to_cpu(key->k_dir_id),
- le32_to_cpu(key->k_objectid), le_offset(key),
- le_type(key));
- else
- return scnprintf(buf, size, "[NULL]");
-}
-
-/* %K */
-static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key)
-{
- if (key)
- return scnprintf(buf, size, "[%d %d %s %s]",
- key->on_disk_key.k_dir_id,
- key->on_disk_key.k_objectid,
- reiserfs_cpu_offset(key), cpu_type(key));
- else
- return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_de_head(char *buf, size_t size,
- struct reiserfs_de_head *deh)
-{
- if (deh)
- return scnprintf(buf, size,
- "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
- deh_offset(deh), deh_dir_id(deh),
- deh_objectid(deh), deh_location(deh),
- deh_state(deh));
- else
- return scnprintf(buf, size, "[NULL]");
-
-}
-
-static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih)
-{
- if (ih) {
- char *p = buf;
- char * const end = buf + size;
-
- p += scnprintf(p, end - p, "%s",
- (ih_version(ih) == KEY_FORMAT_3_6) ?
- "*3.6* " : "*3.5*");
-
- p += scnprintf_le_key(p, end - p, &ih->ih_key);
-
- p += scnprintf(p, end - p,
- ", item_len %d, item_location %d, free_space(entry_count) %d",
- ih_item_len(ih), ih_location(ih),
- ih_free_space(ih));
- return p - buf;
- } else
- return scnprintf(buf, size, "[NULL]");
-}
-
-static int scnprintf_direntry(char *buf, size_t size,
- struct reiserfs_dir_entry *de)
-{
- char name[20];
-
- memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
- name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
- return scnprintf(buf, size, "\"%s\"==>[%d %d]",
- name, de->de_dir_id, de->de_objectid);
-}
-
-static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh)
-{
- return scnprintf(buf, size,
- "level=%d, nr_items=%d, free_space=%d rdkey ",
- B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
-}
-
-static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh)
-{
- return scnprintf(buf, size,
- "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
- bh->b_bdev, bh->b_size,
- (unsigned long long)bh->b_blocknr,
- atomic_read(&(bh->b_count)),
- bh->b_state, bh->b_page,
- buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
- buffer_dirty(bh) ? "DIRTY" : "CLEAN",
- buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
-}
-
-static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc)
-{
- return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]",
- dc_block_number(dc), dc_size(dc));
-}
-
-static char *is_there_reiserfs_struct(char *fmt, int *what)
-{
- char *k = fmt;
-
- while ((k = strchr(k, '%')) != NULL) {
- if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
- k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
- *what = k[1];
- break;
- }
- k++;
- }
- return k;
-}
-
-/*
- * When debugging reiserfs we used to print out a lot of different
- * variables, like keys, item headers, buffer heads etc. Values of
- * most fields matter. So it took a long time just to write an
- * appropriate printk. With this reiserfs_warning you can use format
- * specifications for complex structures like you used to do with
- * printf for integers, doubles and pointers. For instance, to print
- * out a key structure you just write:
- * reiserfs_warning ("bad key %k", key);
- * instead of
- * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
- * key->k_offset, key->k_uniqueness);
- */
-static DEFINE_SPINLOCK(error_lock);
-static void prepare_error_buf(const char *fmt, va_list args)
-{
- char *fmt1 = fmt_buf;
- char *k;
- char *p = error_buf;
- char * const end = &error_buf[sizeof(error_buf)];
- int what;
-
- spin_lock(&error_lock);
-
- if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) {
- strscpy(error_buf, "format string too long", end - error_buf);
- goto out_unlock;
- }
-
- while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) {
- *k = 0;
-
- p += vscnprintf(p, end - p, fmt1, args);
-
- switch (what) {
- case 'k':
- p += scnprintf_le_key(p, end - p,
- va_arg(args, struct reiserfs_key *));
- break;
- case 'K':
- p += scnprintf_cpu_key(p, end - p,
- va_arg(args, struct cpu_key *));
- break;
- case 'h':
- p += scnprintf_item_head(p, end - p,
- va_arg(args, struct item_head *));
- break;
- case 't':
- p += scnprintf_direntry(p, end - p,
- va_arg(args, struct reiserfs_dir_entry *));
- break;
- case 'y':
- p += scnprintf_disk_child(p, end - p,
- va_arg(args, struct disk_child *));
- break;
- case 'z':
- p += scnprintf_block_head(p, end - p,
- va_arg(args, struct buffer_head *));
- break;
- case 'b':
- p += scnprintf_buffer_head(p, end - p,
- va_arg(args, struct buffer_head *));
- break;
- case 'a':
- p += scnprintf_de_head(p, end - p,
- va_arg(args, struct reiserfs_de_head *));
- break;
- }
-
- fmt1 = k + 2;
- }
- p += vscnprintf(p, end - p, fmt1, args);
-out_unlock:
- spin_unlock(&error_lock);
-
-}
-
-/*
- * in addition to the usual conversion specifiers this accepts reiserfs
- * specific conversion specifiers:
- * %k to print a little endian key,
- * %K to print a cpu key,
- * %h to print an item_head,
- * %t to print a directory entry,
- * %y to print a disk_child,
- * %z to print a block head (arg must be struct buffer_head *),
- * %b to print a buffer_head,
- * %a to print a directory entry head
- */
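-
-/*
- * Editorial usage sketch (hypothetical error id and message):
- *
- * reiserfs_warning(sb, "vs-0000", "bad item %h in block %b", ih, bh);
- *
- * prepare_error_buf() splits the format string at each custom
- * specifier and expands the corresponding structure into error_buf.
- */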
-
-#define do_reiserfs_warning(fmt)\
-{\
- va_list args;\
- va_start(args, fmt);\
- prepare_error_buf(fmt, args);\
- va_end(args);\
-}
-
-void __reiserfs_warning(struct super_block *sb, const char *id,
- const char *function, const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
- if (sb)
- printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: "
- "%s\n", sb->s_id, id ? id : "", id ? " " : "",
- function, error_buf);
- else
- printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n",
- id ? id : "", id ? " " : "", function, error_buf);
-}
-
-/* No newline.. reiserfs_info calls can be followed by printk's */
-void reiserfs_info(struct super_block *sb, const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
- if (sb)
- printk(KERN_NOTICE "REISERFS (device %s): %s",
- sb->s_id, error_buf);
- else
- printk(KERN_NOTICE "REISERFS %s:", error_buf);
-}
-
-/* No newline.. reiserfs_printk calls can be followed by printk's */
-static void reiserfs_printk(const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
- printk(error_buf);
-}
-
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
-{
-#ifdef CONFIG_REISERFS_CHECK
- do_reiserfs_warning(fmt);
- if (s)
- printk(KERN_DEBUG "REISERFS debug (device %s): %s\n",
- s->s_id, error_buf);
- else
- printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf);
-#endif
-}
-
-/*
- * The format:
- *
- * maintainer-errorid: [function-name:] message
- *
- * where errorid is unique to the maintainer and function-name is
- * optional, is recommended, so that anyone can easily find the bug
- * with a simple grep for the short to type string
- * maintainer-errorid. Don't bother with reusing errorids, there are
- * lots of numbers out there.
- *
- * Example:
- *
- * reiserfs_panic(
- * p_sb, "reiser-29: reiserfs_new_blocknrs: "
- * "one of search_start or rn(%d) is equal to MAX_B_NUM,"
- * "which means that we are optimizing location based on the "
- * "bogus location of a temp buffer (%p).",
- * rn, bh
- * );
- *
- * Regular panic()s sometimes clear the screen before the message can
- * be read, thus the need for the while loop.
- *
- * Numbering scheme for panics used by Vladimir and Anatoly (Hans
- * completely ignores this scheme, and considers it pointless complexity):
- *
- * panics in reiserfs_fs.h have numbers from 1000 to 1999
- * super.c 2000 to 2999
- * preserve.c (unused) 3000 to 3999
- * bitmap.c 4000 to 4999
- * stree.c 5000 to 5999
- * prints.c 6000 to 6999
- * namei.c 7000 to 7999
- * fix_nodes.c 8000 to 8999
- * dir.c 9000 to 9999
- * lbalance.c 10000 to 10999
- * ibalance.c 11000 to 11999 not ready
- * do_balan.c 12000 to 12999
- * inode.c 13000 to 13999
- * file.c 14000 to 14999
- * objectid.c 15000 - 15999
- * buffer.c 16000 - 16999
- * symlink.c 17000 - 17999
- */
-
-void __reiserfs_panic(struct super_block *sb, const char *id,
- const char *function, const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
-
-#ifdef CONFIG_REISERFS_CHECK
- dump_stack();
-#endif
- if (sb)
- printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n",
- sb->s_id, id ? id : "", id ? " " : "",
- function, error_buf);
- else
- printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n",
- id ? id : "", id ? " " : "", function, error_buf);
- BUG();
-}
-
-void __reiserfs_error(struct super_block *sb, const char *id,
- const char *function, const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
-
- BUG_ON(sb == NULL);
-
- if (reiserfs_error_panic(sb))
- __reiserfs_panic(sb, id, function, error_buf);
-
- if (id && id[0])
- printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n",
- sb->s_id, id, function, error_buf);
- else
- printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
-
- if (sb_rdonly(sb))
- return;
-
- reiserfs_info(sb, "Remounting filesystem read-only\n");
- sb->s_flags |= SB_RDONLY;
- reiserfs_abort_journal(sb, -EIO);
-}
-
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
-{
- do_reiserfs_warning(fmt);
-
- if (reiserfs_error_panic(sb)) {
- panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id,
- error_buf);
- }
-
- if (reiserfs_is_journal_aborted(SB_JOURNAL(sb)))
- return;
-
- printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id,
- error_buf);
-
- sb->s_flags |= SB_RDONLY;
- reiserfs_abort_journal(sb, errno);
-}
-
-/*
- * this prints internal nodes (4 keys/items in line) (dc_number,
- * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
- * dc_size)...
- */
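-/*
- * Editorial example of the resulting output (hypothetical numbers):
- *
- * PTR 0: [dc_number=42, dc_size=128] KEY 0: [1 2 0x1 SD] PTR 1: ...
- */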
-static int print_internal(struct buffer_head *bh, int first, int last)
-{
- struct reiserfs_key *key;
- struct disk_child *dc;
- int i;
- int from, to;
-
- if (!B_IS_KEYS_LEVEL(bh))
- return 1;
-
- check_internal(bh);
-
- if (first == -1) {
- from = 0;
- to = B_NR_ITEMS(bh);
- } else {
- from = first;
- to = min_t(int, last, B_NR_ITEMS(bh));
- }
-
- reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
- dc = B_N_CHILD(bh, from);
- reiserfs_printk("PTR %d: %y ", from, dc);
-
- for (i = from, key = internal_key(bh, from), dc++; i < to;
- i++, key++, dc++) {
- reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
- if (i && i % 4 == 0)
- printk("\n");
- }
- printk("\n");
- return 0;
-}
-
-static int print_leaf(struct buffer_head *bh, int print_mode, int first,
- int last)
-{
- struct block_head *blkh;
- struct item_head *ih;
- int i, nr;
- int from, to;
-
- if (!B_IS_ITEMS_LEVEL(bh))
- return 1;
-
- check_leaf(bh);
-
- blkh = B_BLK_HEAD(bh);
- ih = item_head(bh, 0);
- nr = blkh_nr_item(blkh);
-
- printk("\n===================================================================\n");
- reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
-
- if (!(print_mode & PRINT_LEAF_ITEMS)) {
- reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
- &(ih->ih_key), &((ih + nr - 1)->ih_key));
- return 0;
- }
-
- if (first < 0 || first > nr - 1)
- from = 0;
- else
- from = first;
-
- if (last < 0 || last > nr)
- to = nr;
- else
- to = last;
-
- ih += from;
- printk("-------------------------------------------------------------------------------\n");
- printk("|##| type | key | ilen | free_space | version | loc |\n");
- for (i = from; i < to; i++, ih++) {
- printk("-------------------------------------------------------------------------------\n");
- reiserfs_printk("|%2d| %h |\n", i, ih);
- if (print_mode & PRINT_LEAF_ITEMS)
- op_print_item(ih, ih_item_body(bh, ih));
- }
-
- printk("===================================================================\n");
-
- return 0;
-}
-
-char *reiserfs_hashname(int code)
-{
- if (code == YURA_HASH)
- return "rupasov";
- if (code == TEA_HASH)
- return "tea";
- if (code == R5_HASH)
- return "r5";
-
- return "unknown";
-}
-
-/* return 1 if this is not super block */
-static int print_super_block(struct buffer_head *bh)
-{
- struct reiserfs_super_block *rs =
- (struct reiserfs_super_block *)(bh->b_data);
- int skipped, data_blocks;
- char *version;
-
- if (is_reiserfs_3_5(rs)) {
- version = "3.5";
- } else if (is_reiserfs_3_6(rs)) {
- version = "3.6";
- } else if (is_reiserfs_jr(rs)) {
- version = ((sb_version(rs) == REISERFS_VERSION_2) ?
- "3.6" : "3.5");
- } else {
- return 1;
- }
-
- printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
- (unsigned long long)bh->b_blocknr);
- printk("Reiserfs version %s\n", version);
- printk("Block count %u\n", sb_block_count(rs));
- printk("Blocksize %d\n", sb_blocksize(rs));
- printk("Free blocks %u\n", sb_free_blocks(rs));
- /*
- * FIXME: this would be confusing if
- * someone stores reiserfs super block in some data block ;)
-// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
- */
- skipped = bh->b_blocknr;
- data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
- (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
- 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
- printk("Busy blocks: skipped %d, bitmaps - %d, journal (or reserved) blocks - %d,\n"
- "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
- (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
- sb_reserved_for_journal(rs)), data_blocks);
- printk("Root block %u\n", sb_root_block(rs));
- printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
- printk("Journal dev %d\n", sb_jp_journal_dev(rs));
- printk("Journal orig size %d\n", sb_jp_journal_size(rs));
- printk("FS state %d\n", sb_fs_state(rs));
- printk("Hash function \"%s\"\n",
- reiserfs_hashname(sb_hash_function_code(rs)));
-
- printk("Tree height %d\n", sb_tree_height(rs));
- return 0;
-}
-
-static int print_desc_block(struct buffer_head *bh)
-{
- struct reiserfs_journal_desc *desc;
-
- if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
- return 1;
-
- desc = (struct reiserfs_journal_desc *)(bh->b_data);
- printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
- (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
- get_desc_mount_id(desc), get_desc_trans_len(desc));
-
- return 0;
-}
-/* ..., int print_mode, int first, int last) */
-void print_block(struct buffer_head *bh, ...)
-{
- va_list args;
- int mode, first, last;
-
- if (!bh) {
- printk("print_block: buffer is NULL\n");
- return;
- }
-
- va_start(args, bh);
-
- mode = va_arg(args, int);
- first = va_arg(args, int);
- last = va_arg(args, int);
- if (print_leaf(bh, mode, first, last))
- if (print_internal(bh, first, last))
- if (print_super_block(bh))
- if (print_desc_block(bh))
- printk("Block %llu contains unformatted data\n",
- (unsigned long long)bh->b_blocknr);
-
- va_end(args);
-}
-
-static char print_tb_buf[2048];
-
-/* this stores initial state of tree balance in the print_tb_buf */
-void store_print_tb(struct tree_balance *tb)
-{
- int h = 0;
- int i;
- struct buffer_head *tbSh, *tbFh;
-
- if (!tb)
- return;
-
- sprintf(print_tb_buf, "\n"
- "BALANCING %d\n"
- "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
- "=====================================================================\n"
- "* h * S * L * R * F * FL * FR * CFL * CFR *\n",
- REISERFS_SB(tb->tb_sb)->s_do_balance,
- tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
- tb->tb_path->pos_in_item);
-
- for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) {
- if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
- tb->tb_path->path_length
- && PATH_H_PATH_OFFSET(tb->tb_path,
- h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
- tbSh = PATH_H_PBUFFER(tb->tb_path, h);
- tbFh = PATH_H_PPARENT(tb->tb_path, h);
- } else {
- tbSh = NULL;
- tbFh = NULL;
- }
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
- h,
- (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
- (tbSh) ? atomic_read(&tbSh->b_count) : -1,
- (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
- (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
- (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
- (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
- (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
- (tb->FL[h]) ? (long long)(tb->FL[h]->
- b_blocknr) : (-1LL),
- (tb->FR[h]) ? (long long)(tb->FR[h]->
- b_blocknr) : (-1LL),
- (tb->CFL[h]) ? (long long)(tb->CFL[h]->
- b_blocknr) : (-1LL),
- (tb->CFR[h]) ? (long long)(tb->CFR[h]->
- b_blocknr) : (-1LL));
- }
-
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "=====================================================================\n"
- "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
- "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
- tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
- tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
- tb->sbytes[0], tb->snum[1], tb->sbytes[1],
- tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
-
- /* this prints balance parameters for non-leaf levels */
- h = 0;
- do {
- h++;
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "* %d * %4d * %2d * * %2d * * %2d *\n",
- h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
- tb->blknum[h]);
- } while (tb->insert_size[h]);
-
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "=====================================================================\n"
- "FEB list: ");
-
- /*
- * print the FEB list: buffers, in the form (bh (b_blocknr, b_count)),
- * that will be used for new nodes
- */
- h = 0;
- for (i = 0; i < ARRAY_SIZE(tb->FEB); i++)
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "%p (%llu %d)%s", tb->FEB[i],
- tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
- b_blocknr : 0ULL,
- tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
- (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
-
- sprintf(print_tb_buf + strlen(print_tb_buf),
- "======================== the end ====================================\n");
-}
-
-void print_cur_tb(char *mes)
-{
- printk("%s\n%s", mes, print_tb_buf);
-}
-
-static void check_leaf_block_head(struct buffer_head *bh)
-{
- struct block_head *blkh;
- int nr;
-
- blkh = B_BLK_HEAD(bh);
- nr = blkh_nr_item(blkh);
- if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
- reiserfs_panic(NULL, "vs-6010", "invalid item number %z",
- bh);
- if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
- reiserfs_panic(NULL, "vs-6020", "invalid free space %z",
- bh);
-
-}
-
-static void check_internal_block_head(struct buffer_head *bh)
-{
- if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
- reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh);
-
- if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
- reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh);
-
- if (B_FREE_SPACE(bh) !=
- bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
- DC_SIZE * (B_NR_ITEMS(bh) + 1))
- reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh);
-
-}
-
-void check_leaf(struct buffer_head *bh)
-{
- int i;
- struct item_head *ih;
-
- if (!bh)
- return;
- check_leaf_block_head(bh);
- for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
- op_check_item(ih, ih_item_body(bh, ih));
-}
-
-void check_internal(struct buffer_head *bh)
-{
- if (!bh)
- return;
- check_internal_block_head(bh);
-}
-
-void print_statistics(struct super_block *s)
-{
-
- /*
- printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
- bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
- REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
- REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
- REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
- */
-
-}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
deleted file mode 100644
index 5c68a4a52d78..000000000000
--- a/fs/reiserfs/procfs.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/* -*- linux-c -*- */
-
-/* fs/reiserfs/procfs.c */
-
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/* proc info support a la one created by Sizif@Botik.RU for PGC */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/blkdev.h>
-
-/*
- * LOCKING:
- *
- * These guys are evicted from procfs as the very first step in ->kill_sb().
- *
- */
-
-static int show_version(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- char *format;
-
- if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
- format = "3.6";
- } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
- format = "3.5";
- } else {
- format = "unknown";
- }
-
- seq_printf(m, "%s format\twith checks %s\n", format,
-#if defined( CONFIG_REISERFS_CHECK )
- "on"
-#else
- "off"
-#endif
- );
- return 0;
-}
-
-#define SF( x ) ( r -> x )
-#define SFP( x ) SF( s_proc_info_data.x )
-#define SFPL( x ) SFP( x[ level ] )
-#define SFPF( x ) SFP( scan_bitmap.x )
-#define SFPJ( x ) SFP( journal.x )
-
-#define D2C( x ) le16_to_cpu( x )
-#define D4C( x ) le32_to_cpu( x )
-#define DF( x ) D2C( rs -> s_v1.x )
-#define DFL( x ) D4C( rs -> s_v1.x )
-
-#define objectid_map( s, rs ) (old_format_only (s) ? \
- (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \
- (__le32 *)(rs + 1))
-#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
-
-#define DJF( x ) le32_to_cpu( rs -> x )
-#define DJP( x ) le32_to_cpu( jp -> x )
-#define JF( x ) ( r -> s_journal -> x )
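-
-/*
- * Editorial note: these shorthands keep the seq_printf() argument
- * lists below readable. For example, given the declarations used in
- * the show_* functions, SFPL(balance_at) expands to
- * r->s_proc_info_data.balance_at[level], and MAP(i) reads the i-th
- * little-endian objectid map entry as a CPU-order value.
- */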
-
-static int show_super(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
- seq_printf(m, "state: \t%s\n"
- "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
- "gen. counter: \t%i\n"
- "s_disk_reads: \t%i\n"
- "s_disk_writes: \t%i\n"
- "s_fix_nodes: \t%i\n"
- "s_do_balance: \t%i\n"
- "s_unneeded_left_neighbor: \t%i\n"
- "s_good_search_by_key_reada: \t%i\n"
- "s_bmaps: \t%i\n"
- "s_bmaps_without_search: \t%i\n"
- "s_direct2indirect: \t%i\n"
- "s_indirect2direct: \t%i\n"
- "\n"
- "max_hash_collisions: \t%i\n"
- "breads: \t%lu\n"
- "bread_misses: \t%lu\n"
- "search_by_key: \t%lu\n"
- "search_by_key_fs_changed: \t%lu\n"
- "search_by_key_restarted: \t%lu\n"
- "insert_item_restarted: \t%lu\n"
- "paste_into_item_restarted: \t%lu\n"
- "cut_from_item_restarted: \t%lu\n"
- "delete_solid_item_restarted: \t%lu\n"
- "delete_item_restarted: \t%lu\n"
- "leaked_oid: \t%lu\n"
- "leaves_removable: \t%lu\n",
- SF(s_mount_state) == REISERFS_VALID_FS ?
- "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
- reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
- reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
- reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
- reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
- reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
- reiserfs_no_unhashed_relocation(sb) ?
- "NO_UNHASHED_RELOCATION " : "",
- reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
- reiserfs_test4(sb) ? "TEST4 " : "",
- have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
- "SMALL_TAILS " : "NO_TAILS ",
- replay_only(sb) ? "REPLAY_ONLY " : "",
- convert_reiserfs(sb) ? "CONV " : "",
- atomic_read(&r->s_generation_counter),
- SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
- SF(s_do_balance), SF(s_unneeded_left_neighbor),
- SF(s_good_search_by_key_reada), SF(s_bmaps),
- SF(s_bmaps_without_search), SF(s_direct2indirect),
- SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
- SFP(bread_miss), SFP(search_by_key),
- SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
- SFP(insert_item_restarted), SFP(paste_into_item_restarted),
- SFP(cut_from_item_restarted),
- SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
- SFP(leaked_oid), SFP(leaves_removable));
-
- return 0;
-}
-
-static int show_per_level(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *r = REISERFS_SB(sb);
- int level;
-
- seq_printf(m, "level\t"
- " balances"
- " [sbk: reads"
- " fs_changed"
- " restarted]"
- " free space"
- " items"
- " can_remove"
- " lnum"
- " rnum"
- " lbytes"
- " rbytes"
- " get_neig"
- " get_neig_res" " need_l_neig" " need_r_neig" "\n");
-
- for (level = 0; level < MAX_HEIGHT; ++level) {
- seq_printf(m, "%i\t"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12li"
- " %12li"
- " %12li"
- " %12li"
- " %12lu"
- " %12lu"
- " %12lu"
- " %12lu"
- "\n",
- level,
- SFPL(balance_at),
- SFPL(sbk_read_at),
- SFPL(sbk_fs_changed),
- SFPL(sbk_restarted),
- SFPL(free_at),
- SFPL(items_at),
- SFPL(can_node_be_removed),
- SFPL(lnum),
- SFPL(rnum),
- SFPL(lbytes),
- SFPL(rbytes),
- SFPL(get_neighbors),
- SFPL(get_neighbors_restart),
- SFPL(need_l_neighbor), SFPL(need_r_neighbor)
- );
- }
- return 0;
-}
-
-static int show_bitmap(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *r = REISERFS_SB(sb);
-
- seq_printf(m, "free_block: %lu\n"
- " scan_bitmap:"
- " wait"
- " bmap"
- " retry"
- " stolen"
- " journal_hint"
- "journal_nohint"
- "\n"
- " %14lu"
- " %14lu"
- " %14lu"
- " %14lu"
- " %14lu"
- " %14lu"
- " %14lu"
- "\n",
- SFP(free_block),
- SFPF(call),
- SFPF(wait),
- SFPF(bmap),
- SFPF(retry),
- SFPF(stolen),
- SFPF(in_journal_hint), SFPF(in_journal_nohint));
-
- return 0;
-}
-
-static int show_on_disk_super(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
- struct reiserfs_super_block *rs = sb_info->s_rs;
- int hash_code = DFL(s_hash_function_code);
- __u32 flags = DJF(s_flags);
-
- seq_printf(m, "block_count: \t%i\n"
- "free_blocks: \t%i\n"
- "root_block: \t%i\n"
- "blocksize: \t%i\n"
- "oid_maxsize: \t%i\n"
- "oid_cursize: \t%i\n"
- "umount_state: \t%i\n"
- "magic: \t%10.10s\n"
- "fs_state: \t%i\n"
- "hash: \t%s\n"
- "tree_height: \t%i\n"
- "bmap_nr: \t%i\n"
- "version: \t%i\n"
- "flags: \t%x[%s]\n"
- "reserved_for_journal: \t%i\n",
- DFL(s_block_count),
- DFL(s_free_blocks),
- DFL(s_root_block),
- DF(s_blocksize),
- DF(s_oid_maxsize),
- DF(s_oid_cursize),
- DF(s_umount_state),
- rs->s_v1.s_magic,
- DF(s_fs_state),
- hash_code == TEA_HASH ? "tea" :
- (hash_code == YURA_HASH) ? "rupasov" :
- (hash_code == R5_HASH) ? "r5" :
- (hash_code == UNSET_HASH) ? "unset" : "unknown",
- DF(s_tree_height),
- DF(s_bmap_nr),
- DF(s_version), flags, (flags & reiserfs_attrs_cleared)
- ? "attrs_cleared" : "", DF(s_reserved_for_journal));
-
- return 0;
-}
-
-static int show_oidmap(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
- struct reiserfs_super_block *rs = sb_info->s_rs;
- unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
- unsigned long total_used = 0;
- int i;
-
- for (i = 0; i < mapsize; ++i) {
- __u32 right;
-
- right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
- seq_printf(m, "%s: [ %x .. %x )\n",
- (i & 1) ? "free" : "used", MAP(i), right);
- if (!(i & 1)) {
- total_used += right - MAP(i);
- }
- }
-#if defined( REISERFS_USE_OIDMAPF )
- if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
- loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
- total_used += size / sizeof(reiserfs_oidinterval_d_t);
- }
-#endif
- seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
- mapsize,
- mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
- return 0;
-}
-
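-/*
- * Editorial note: j_trans_start_time is taken from the monotonic
- * clock; this helper converts a monotonic seconds value to wall-clock
- * seconds for display. The NSEC_PER_SEC/2 offset makes the final
- * division round to the nearest second.
- */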
-static time64_t ktime_mono_to_real_seconds(time64_t mono)
-{
- ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2);
-
- return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC);
-}
-
-static int show_journal(struct seq_file *m, void *unused)
-{
- struct super_block *sb = m->private;
- struct reiserfs_sb_info *r = REISERFS_SB(sb);
- struct reiserfs_super_block *rs = r->s_rs;
- struct journal_params *jp = &rs->s_v1.s_journal;
-
- seq_printf(m, /* on-disk fields */
- "jp_journal_1st_block: \t%i\n"
- "jp_journal_dev: \t%pg[%x]\n"
- "jp_journal_size: \t%i\n"
- "jp_journal_trans_max: \t%i\n"
- "jp_journal_magic: \t%i\n"
- "jp_journal_max_batch: \t%i\n"
- "jp_journal_max_commit_age: \t%i\n"
- "jp_journal_max_trans_age: \t%i\n"
- /* incore fields */
- "j_1st_reserved_block: \t%i\n"
- "j_state: \t%li\n"
- "j_trans_id: \t%u\n"
- "j_mount_id: \t%lu\n"
- "j_start: \t%lu\n"
- "j_len: \t%lu\n"
- "j_len_alloc: \t%lu\n"
- "j_wcount: \t%i\n"
- "j_bcount: \t%lu\n"
- "j_first_unflushed_offset: \t%lu\n"
- "j_last_flush_trans_id: \t%u\n"
- "j_trans_start_time: \t%lli\n"
- "j_list_bitmap_index: \t%i\n"
- "j_must_wait: \t%i\n"
- "j_next_full_flush: \t%i\n"
- "j_next_async_flush: \t%i\n"
- "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
- /* reiserfs_proc_info_data_t.journal fields */
- "in_journal: \t%12lu\n"
- "in_journal_bitmap: \t%12lu\n"
- "in_journal_reusable: \t%12lu\n"
- "lock_journal: \t%12lu\n"
- "lock_journal_wait: \t%12lu\n"
- "journal_begin: \t%12lu\n"
- "journal_relock_writers: \t%12lu\n"
- "journal_relock_wcount: \t%12lu\n"
- "mark_dirty: \t%12lu\n"
- "mark_dirty_already: \t%12lu\n"
- "mark_dirty_notjournal: \t%12lu\n"
- "restore_prepared: \t%12lu\n"
- "prepare: \t%12lu\n"
- "prepare_retry: \t%12lu\n",
- DJP(jp_journal_1st_block),
- file_bdev(SB_JOURNAL(sb)->j_bdev_file),
- DJP(jp_journal_dev),
- DJP(jp_journal_size),
- DJP(jp_journal_trans_max),
- DJP(jp_journal_magic),
- DJP(jp_journal_max_batch),
- SB_JOURNAL(sb)->j_max_commit_age,
- DJP(jp_journal_max_trans_age),
- JF(j_1st_reserved_block),
- JF(j_state),
- JF(j_trans_id),
- JF(j_mount_id),
- JF(j_start),
- JF(j_len),
- JF(j_len_alloc),
- atomic_read(&r->s_journal->j_wcount),
- JF(j_bcount),
- JF(j_first_unflushed_offset),
- JF(j_last_flush_trans_id),
- ktime_mono_to_real_seconds(JF(j_trans_start_time)),
- JF(j_list_bitmap_index),
- JF(j_must_wait),
- JF(j_next_full_flush),
- JF(j_next_async_flush),
- JF(j_cnode_used),
- JF(j_cnode_free),
- SFPJ(in_journal),
- SFPJ(in_journal_bitmap),
- SFPJ(in_journal_reusable),
- SFPJ(lock_journal),
- SFPJ(lock_journal_wait),
- SFPJ(journal_being),
- SFPJ(journal_relock_writers),
- SFPJ(journal_relock_wcount),
- SFPJ(mark_dirty),
- SFPJ(mark_dirty_already),
- SFPJ(mark_dirty_notjournal),
- SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
- );
- return 0;
-}
-
-static struct proc_dir_entry *proc_info_root = NULL;
-static const char proc_info_root_name[] = "fs/reiserfs";
-
-static void add_file(struct super_block *sb, char *name,
- int (*func) (struct seq_file *, void *))
-{
- proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb);
-}
-
-int reiserfs_proc_info_init(struct super_block *sb)
-{
- char b[BDEVNAME_SIZE];
- char *s;
-
- /* Some block devices use /'s */
- strscpy(b, sb->s_id, BDEVNAME_SIZE);
- s = strchr(b, '/');
- if (s)
- *s = '!';
-
- spin_lock_init(&__PINFO(sb).lock);
- REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
- if (REISERFS_SB(sb)->procdir) {
- add_file(sb, "version", show_version);
- add_file(sb, "super", show_super);
- add_file(sb, "per-level", show_per_level);
- add_file(sb, "bitmap", show_bitmap);
- add_file(sb, "on-disk-super", show_on_disk_super);
- add_file(sb, "oidmap", show_oidmap);
- add_file(sb, "journal", show_journal);
- return 0;
- }
- reiserfs_warning(sb, NULL, "cannot create /proc/%s/%s",
- proc_info_root_name, b);
- return 1;
-}
-
-int reiserfs_proc_info_done(struct super_block *sb)
-{
- struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
- if (de) {
- char b[BDEVNAME_SIZE];
- char *s;
-
- /* Some block devices use /'s */
- strscpy(b, sb->s_id, BDEVNAME_SIZE);
- s = strchr(b, '/');
- if (s)
- *s = '!';
-
- remove_proc_subtree(b, proc_info_root);
- REISERFS_SB(sb)->procdir = NULL;
- }
- return 0;
-}
-
-int reiserfs_proc_info_global_init(void)
-{
- if (proc_info_root == NULL) {
- proc_info_root = proc_mkdir(proc_info_root_name, NULL);
- if (!proc_info_root) {
- reiserfs_warning(NULL, NULL, "cannot create /proc/%s",
- proc_info_root_name);
- return 1;
- }
- }
- return 0;
-}
-
-int reiserfs_proc_info_global_done(void)
-{
- if (proc_info_root != NULL) {
- proc_info_root = NULL;
- remove_proc_entry(proc_info_root_name, NULL);
- }
- return 0;
-}
-/*
- * Revision 1.1.8.2 2001/07/15 17:08:42 god
- * . use get_super() in procfs.c
- * . remove remove_save_link() from reiserfs_do_truncate()
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- * Revision 1.1.8.1 2001/07/11 16:48:50 god
- * proc info support
- *
- * I accept terms and conditions stated in the Legal Agreement
- * (available at http://www.namesys.com/legalese.html)
- *
- */
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
deleted file mode 100644
index 12fc20af8e17..000000000000
--- a/fs/reiserfs/reiserfs.h
+++ /dev/null
@@ -1,3419 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
- * licensing and copyright details
- */
-
-#include <linux/reiserfs_fs.h>
-
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/bug.h>
-#include <linux/workqueue.h>
-#include <linux/unaligned.h>
-#include <linux/bitops.h>
-#include <linux/proc_fs.h>
-#include <linux/buffer_head.h>
-
-/* the 32 bit compat definitions with int argument */
-#define REISERFS_IOC32_UNPACK _IOW(0xCD, 1, int)
-#define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION
-#define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION
-
-struct reiserfs_journal_list;
-
-/* bitmasks for i_flags field in reiserfs-specific part of inode */
-typedef enum {
- /*
- * this says what format of key all items (except stat data) of
- * an object have. If this is set, the format is 3.6, otherwise 3.5
- */
- i_item_key_version_mask = 0x0001,
-
- /*
- * If this is unset, object has 3.5 stat data, otherwise,
- * it has 3.6 stat data with 64bit size, 32bit nlink etc.
- */
- i_stat_data_version_mask = 0x0002,
-
- /* file might need tail packing on close */
- i_pack_on_close_mask = 0x0004,
-
- /* don't pack tail of file */
- i_nopack_mask = 0x0008,
-
- /*
- * If either of these are set, "safe link" was created for this
- * file during truncate or unlink. Safe link is used to avoid
- * leakage of disk space on crash with some files open, but unlinked.
- */
- i_link_saved_unlink_mask = 0x0010,
- i_link_saved_truncate_mask = 0x0020,
-
- i_has_xattr_dir = 0x0040,
- i_data_log = 0x0080,
-} reiserfs_inode_flags;
-
-struct reiserfs_inode_info {
- __u32 i_key[4]; /* key is still 4 32 bit integers */
-
- /*
- * transient inode flags that are never stored on disk. Bitmasks
- * for this field are defined above.
- */
- __u32 i_flags;
-
- /* offset of first byte stored in direct item. */
- __u32 i_first_direct_byte;
-
- /* copy of persistent inode flags read from sd_attrs. */
- __u32 i_attrs;
-
- /* first unused block of a sequence of unused blocks */
- int i_prealloc_block;
- int i_prealloc_count; /* length of that sequence */
-
- /* per-transaction list of inodes which have preallocated blocks */
- struct list_head i_prealloc_list;
-
- /*
- * new_packing_locality is created; new blocks for the contents
- * of this directory should be displaced
- */
- unsigned new_packing_locality:1;
-
- /*
- * we use these for fsync or O_SYNC to decide which transaction
- * needs to be committed in order for this inode to be properly
- * flushed
- */
- unsigned int i_trans_id;
-
- struct reiserfs_journal_list *i_jl;
- atomic_t openers;
- struct mutex tailpack;
-#ifdef CONFIG_REISERFS_FS_XATTR
- struct rw_semaphore i_xattr_sem;
-#endif
-#ifdef CONFIG_QUOTA
- struct dquot __rcu *i_dquot[MAXQUOTAS];
-#endif
-
- struct inode vfs_inode;
-};
-
-typedef enum {
- reiserfs_attrs_cleared = 0x00000001,
-} reiserfs_super_block_flags;
-
-/*
- * struct reiserfs_super_block accessors/mutators. Since this is a disk
- * structure, it will always be in little endian format.
- */
-#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
-#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
-#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
-#define set_sb_free_blocks(sbp,v) ((sbp)->s_v1.s_free_blocks = cpu_to_le32(v))
-#define sb_root_block(sbp) (le32_to_cpu((sbp)->s_v1.s_root_block))
-#define set_sb_root_block(sbp,v) ((sbp)->s_v1.s_root_block = cpu_to_le32(v))
-
-#define sb_jp_journal_1st_block(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_1st_block))
-#define set_sb_jp_journal_1st_block(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_1st_block = cpu_to_le32(v))
-#define sb_jp_journal_dev(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_dev))
-#define set_sb_jp_journal_dev(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_dev = cpu_to_le32(v))
-#define sb_jp_journal_size(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_size))
-#define set_sb_jp_journal_size(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_size = cpu_to_le32(v))
-#define sb_jp_journal_trans_max(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_trans_max))
-#define set_sb_jp_journal_trans_max(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_trans_max = cpu_to_le32(v))
-#define sb_jp_journal_magic(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_magic))
-#define set_sb_jp_journal_magic(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_magic = cpu_to_le32(v))
-#define sb_jp_journal_max_batch(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_batch))
-#define set_sb_jp_journal_max_batch(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_max_batch = cpu_to_le32(v))
-#define sb_jp_jourmal_max_commit_age(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_journal.jp_journal_max_commit_age))
-#define set_sb_jp_journal_max_commit_age(sbp,v) \
- ((sbp)->s_v1.s_journal.jp_journal_max_commit_age = cpu_to_le32(v))
-
-#define sb_blocksize(sbp) (le16_to_cpu((sbp)->s_v1.s_blocksize))
-#define set_sb_blocksize(sbp,v) ((sbp)->s_v1.s_blocksize = cpu_to_le16(v))
-#define sb_oid_maxsize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_maxsize))
-#define set_sb_oid_maxsize(sbp,v) ((sbp)->s_v1.s_oid_maxsize = cpu_to_le16(v))
-#define sb_oid_cursize(sbp) (le16_to_cpu((sbp)->s_v1.s_oid_cursize))
-#define set_sb_oid_cursize(sbp,v) ((sbp)->s_v1.s_oid_cursize = cpu_to_le16(v))
-#define sb_umount_state(sbp) (le16_to_cpu((sbp)->s_v1.s_umount_state))
-#define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
-#define sb_fs_state(sbp) (le16_to_cpu((sbp)->s_v1.s_fs_state))
-#define set_sb_fs_state(sbp,v) ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
-#define sb_hash_function_code(sbp) \
- (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
-#define set_sb_hash_function_code(sbp,v) \
- ((sbp)->s_v1.s_hash_function_code = cpu_to_le32(v))
-#define sb_tree_height(sbp) (le16_to_cpu((sbp)->s_v1.s_tree_height))
-#define set_sb_tree_height(sbp,v) ((sbp)->s_v1.s_tree_height = cpu_to_le16(v))
-#define sb_bmap_nr(sbp) (le16_to_cpu((sbp)->s_v1.s_bmap_nr))
-#define set_sb_bmap_nr(sbp,v) ((sbp)->s_v1.s_bmap_nr = cpu_to_le16(v))
-#define sb_version(sbp) (le16_to_cpu((sbp)->s_v1.s_version))
-#define set_sb_version(sbp,v) ((sbp)->s_v1.s_version = cpu_to_le16(v))
-
-#define sb_mnt_count(sbp) (le16_to_cpu((sbp)->s_mnt_count))
-#define set_sb_mnt_count(sbp, v) ((sbp)->s_mnt_count = cpu_to_le16(v))
-
-#define sb_reserved_for_journal(sbp) \
- (le16_to_cpu((sbp)->s_v1.s_reserved_for_journal))
-#define set_sb_reserved_for_journal(sbp,v) \
- ((sbp)->s_v1.s_reserved_for_journal = cpu_to_le16(v))
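-
-/*
- * Editorial usage sketch (hypothetical call site): the accessors hide
- * the endianness conversion, so returning one block to the free count
- * would read
- *
- * set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
- */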
-
-/* LOGGING -- */
-
-/*
- * These all interrelate for performance.
- *
- * If the journal block count is smaller than n transactions, you lose speed.
- * I don't know what n is yet, I'm guessing 8-16.
- *
- * typical transaction size depends on the application, how often fsync is
- * called, and how many metadata blocks you dirty in a 30 second period.
- * The more small files (<16k) you use, the larger your transactions will
- * be.
- *
- * If your journal fills faster than dirty buffers get flushed to disk, it
- * must flush them before allowing the journal to wrap, which slows things
- * down. If you need high speed meta data updates, the journal should be
- * big enough to prevent wrapping before dirty meta blocks get to disk.
- *
- * If the batch max is smaller than the transaction max, you'll waste space
- * at the end of the journal because journal_end sets the next transaction
- * to start at 0 if the next transaction has any chance of wrapping.
- *
- * The larger the max batch age, the better the speed, and the more
- * metadata changes you'll lose after a crash.
- */
-
-/* don't mess with these for a while */
-/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
-#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
-#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
-#define JOURNAL_HASH_SIZE 8192
-
-/* number of copies of the bitmaps to have floating. Must be >= 2 */
-#define JOURNAL_NUM_BITMAPS 5
-
-/*
- * One of these for every block in every transaction
- * Each one is in two hash tables. First, a hash of the current transaction,
- * and after journal_end, a hash of all the in memory transactions.
- * next and prev are used by the current transaction (journal_hash).
- * hnext and hprev are used by journal_list_hash. If a block is in more
- * than one transaction, the journal_list_hash links it in multiple times.
- * This allows flush_journal_list to remove just the cnode belonging to a
- * given transaction.
- */
-struct reiserfs_journal_cnode {
- struct buffer_head *bh; /* real buffer head */
- struct super_block *sb; /* dev of real buffer head */
-
- /* block number of real buffer head, == 0 when buffer on disk */
- __u32 blocknr;
-
- unsigned long state;
-
- /* journal list this cnode lives in */
- struct reiserfs_journal_list *jlist;
-
- struct reiserfs_journal_cnode *next; /* next in transaction list */
- struct reiserfs_journal_cnode *prev; /* prev in transaction list */
- struct reiserfs_journal_cnode *hprev; /* prev in hash list */
- struct reiserfs_journal_cnode *hnext; /* next in hash list */
-};
-
-struct reiserfs_bitmap_node {
- int id;
- char *data;
- struct list_head list;
-};
-
-struct reiserfs_list_bitmap {
- struct reiserfs_journal_list *journal_list;
- struct reiserfs_bitmap_node **bitmaps;
-};
-
-/*
- * one of these for each transaction. The most important part here is the
- * j_realblock. this list of cnodes is used to hash all the blocks in all
- * the commits, to mark all the real buffer heads dirty once all the commits
- * hit the disk, and to make sure every real block in a transaction is on
- * disk before allowing the log area to be overwritten
- */
-struct reiserfs_journal_list {
- unsigned long j_start;
- unsigned long j_state;
- unsigned long j_len;
- atomic_t j_nonzerolen;
- atomic_t j_commit_left;
-
- /* all commits older than this on disk */
- atomic_t j_older_commits_done;
-
- struct mutex j_commit_mutex;
- unsigned int j_trans_id;
- time64_t j_timestamp; /* write-only but useful for crash dump analysis */
- struct reiserfs_list_bitmap *j_list_bitmap;
- struct buffer_head *j_commit_bh; /* commit buffer head */
- struct reiserfs_journal_cnode *j_realblock;
- struct reiserfs_journal_cnode *j_freedlist; /* list of buffers that were freed during this trans. free each of these on flush */
- /* time ordered list of all active transactions */
- struct list_head j_list;
-
- /*
- * time ordered list of all transactions we haven't tried
- * to flush yet
- */
- struct list_head j_working_list;
-
- /* list of tail conversion targets in need of flush before commit */
- struct list_head j_tail_bh_list;
-
- /* list of data=ordered buffers in need of flush before commit */
- struct list_head j_bh_list;
- int j_refcount;
-};
-
-struct reiserfs_journal {
- struct buffer_head **j_ap_blocks; /* journal blocks on disk */
- /* newest journal block */
- struct reiserfs_journal_cnode *j_last;
-
- /* oldest journal block. start here for traverse */
- struct reiserfs_journal_cnode *j_first;
-
- struct file *j_bdev_file;
-
- /* first block on s_dev of reserved area journal */
- int j_1st_reserved_block;
-
- unsigned long j_state;
- unsigned int j_trans_id;
- unsigned long j_mount_id;
-
- /* start of current waiting commit (index into j_ap_blocks) */
- unsigned long j_start;
- unsigned long j_len; /* length of current waiting commit */
-
- /* number of buffers requested by journal_begin() */
- unsigned long j_len_alloc;
-
- atomic_t j_wcount; /* count of writers for current commit */
-
- /* batch count. allows turning X transactions into 1 */
- unsigned long j_bcount;
-
- /* first unflushed transactions offset */
- unsigned long j_first_unflushed_offset;
-
- /* trans id of the last fully flushed transaction */
- unsigned j_last_flush_trans_id;
-
- struct buffer_head *j_header_bh;
-
- time64_t j_trans_start_time; /* time this transaction started */
- struct mutex j_mutex;
- struct mutex j_flush_mutex;
-
- /* wait for current transaction to finish before starting new one */
- wait_queue_head_t j_join_wait;
-
- atomic_t j_jlock; /* lock for j_join_wait */
- int j_list_bitmap_index; /* number of next list bitmap to use */
-
- /* no more journal begins allowed. MUST sleep on j_join_wait */
- int j_must_wait;
-
- /* next journal_end will flush all journal list */
- int j_next_full_flush;
-
- /* next journal_end will flush all async commits */
- int j_next_async_flush;
-
- int j_cnode_used; /* number of cnodes on the used list */
- int j_cnode_free; /* number of cnodes on the free list */
-
- /* max number of blocks in a transaction. */
- unsigned int j_trans_max;
-
- /* max number of blocks to batch into a trans */
- unsigned int j_max_batch;
-
- /* in seconds, how old can an async commit be */
- unsigned int j_max_commit_age;
-
- /* in seconds, how old can a transaction be */
- unsigned int j_max_trans_age;
-
- /* the default for the max commit age */
- unsigned int j_default_max_commit_age;
-
- struct reiserfs_journal_cnode *j_cnode_free_list;
-
- /* orig pointer returned from vmalloc */
- struct reiserfs_journal_cnode *j_cnode_free_orig;
-
- struct reiserfs_journal_list *j_current_jl;
- int j_free_bitmap_nodes;
- int j_used_bitmap_nodes;
-
- int j_num_lists; /* total number of active transactions */
- int j_num_work_lists; /* number that need attention from kreiserfsd */
-
- /* debugging to make sure things are flushed in order */
- unsigned int j_last_flush_id;
-
- /* debugging to make sure things are committed in order */
- unsigned int j_last_commit_id;
-
- struct list_head j_bitmap_nodes;
- struct list_head j_dirty_buffers;
- spinlock_t j_dirty_buffers_lock; /* protects j_dirty_buffers */
-
- /* list of all active transactions */
- struct list_head j_journal_list;
-
- /* lists that haven't been touched by writeback attempts */
- struct list_head j_working_list;
-
- /* hash table for real buffer heads in current trans */
- struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
-
- /* hash table for all the real buffer heads in all the transactions */
- struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
-
- /* array of bitmaps to record the deleted blocks */
- struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
-
- /* list of inodes which have preallocated blocks */
- struct list_head j_prealloc_list;
- int j_persistent_trans;
- unsigned long j_max_trans_size;
- unsigned long j_max_batch_size;
-
- int j_errno;
-
- /* when flushing ordered buffers, throttle new ordered writers */
- struct delayed_work j_work;
- struct super_block *j_work_sb;
- atomic_t j_async_throttle;
-};
-
-enum journal_state_bits {
- J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
- J_WRITERS_QUEUED, /* set when log is full due to too many writers */
- J_ABORTED, /* set when log is aborted */
-};
-
-/* ick. magic string to find desc blocks in the journal */
-#define JOURNAL_DESC_MAGIC "ReIsErLB"
-
-typedef __u32(*hashf_t) (const signed char *, int);
-
-struct reiserfs_bitmap_info {
- __u32 free_count;
-};
-
-struct proc_dir_entry;
-
-#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
-typedef unsigned long int stat_cnt_t;
-typedef struct reiserfs_proc_info_data {
- spinlock_t lock;
- int exiting;
- int max_hash_collisions;
-
- stat_cnt_t breads;
- stat_cnt_t bread_miss;
- stat_cnt_t search_by_key;
- stat_cnt_t search_by_key_fs_changed;
- stat_cnt_t search_by_key_restarted;
-
- stat_cnt_t insert_item_restarted;
- stat_cnt_t paste_into_item_restarted;
- stat_cnt_t cut_from_item_restarted;
- stat_cnt_t delete_solid_item_restarted;
- stat_cnt_t delete_item_restarted;
-
- stat_cnt_t leaked_oid;
- stat_cnt_t leaves_removable;
-
- /*
- * balances per level.
- * Use explicit 5 as MAX_HEIGHT is not visible yet.
- */
- stat_cnt_t balance_at[5]; /* XXX */
- /* sbk == search_by_key */
- stat_cnt_t sbk_read_at[5]; /* XXX */
- stat_cnt_t sbk_fs_changed[5];
- stat_cnt_t sbk_restarted[5];
- stat_cnt_t items_at[5]; /* XXX */
- stat_cnt_t free_at[5]; /* XXX */
- stat_cnt_t can_node_be_removed[5]; /* XXX */
- long int lnum[5]; /* XXX */
- long int rnum[5]; /* XXX */
- long int lbytes[5]; /* XXX */
- long int rbytes[5]; /* XXX */
- stat_cnt_t get_neighbors[5];
- stat_cnt_t get_neighbors_restart[5];
- stat_cnt_t need_l_neighbor[5];
- stat_cnt_t need_r_neighbor[5];
-
- stat_cnt_t free_block;
- struct __scan_bitmap_stats {
- stat_cnt_t call;
- stat_cnt_t wait;
- stat_cnt_t bmap;
- stat_cnt_t retry;
- stat_cnt_t in_journal_hint;
- stat_cnt_t in_journal_nohint;
- stat_cnt_t stolen;
- } scan_bitmap;
- struct __journal_stats {
- stat_cnt_t in_journal;
- stat_cnt_t in_journal_bitmap;
- stat_cnt_t in_journal_reusable;
- stat_cnt_t lock_journal;
- stat_cnt_t lock_journal_wait;
- stat_cnt_t journal_being;
- stat_cnt_t journal_relock_writers;
- stat_cnt_t journal_relock_wcount;
- stat_cnt_t mark_dirty;
- stat_cnt_t mark_dirty_already;
- stat_cnt_t mark_dirty_notjournal;
- stat_cnt_t restore_prepared;
- stat_cnt_t prepare;
- stat_cnt_t prepare_retry;
- } journal;
-} reiserfs_proc_info_data_t;
-#else
-typedef struct reiserfs_proc_info_data {
-} reiserfs_proc_info_data_t;
-#endif
-
-/* Number of quota types we support */
-#define REISERFS_MAXQUOTAS 2
-
-/* reiserfs union of in-core super block data */
-struct reiserfs_sb_info {
- /* Buffer containing the super block */
- struct buffer_head *s_sbh;
-
- /* Pointer to the on-disk super block in the buffer */
- struct reiserfs_super_block *s_rs;
- struct reiserfs_bitmap_info *s_ap_bitmap;
-
- /* pointer to journal information */
- struct reiserfs_journal *s_journal;
-
- unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
-
- /* Serialize writers access, replace the old bkl */
- struct mutex lock;
-
- /* Owner of the lock (can be recursive) */
- struct task_struct *lock_owner;
-
- /* Depth of the lock, start from -1 like the bkl */
- int lock_depth;
-
- struct workqueue_struct *commit_wq;
-
- /* buffer_head end_io callback: (bh, uptodate) */
- void (*end_io_handler) (struct buffer_head *, int);
-
- /*
- * pointer to function which is used to sort names in directory.
- * Set on mount
- */
- hashf_t s_hash_function;
-
- /* reiserfs's mount options are set here */
- unsigned long s_mount_opt;
-
- /* This is a structure that describes block allocator options */
- struct {
- /* Bitfield for enable/disable kind of options */
- unsigned long bits;
-
- /*
- * size started from which we consider file
- * to be a large one (in blocks)
- */
- unsigned long large_file_size;
-
- int border; /* percentage of disk, border takes */
-
- /*
- * Minimal file size (in blocks) starting
- * from which we do preallocations
- */
- int preallocmin;
-
- /*
- * Number of blocks we try to prealloc when file
- * reaches preallocmin size (in blocks) or prealloc_list
- * is empty.
- */
- int preallocsize;
- } s_alloc_options;
-
- /* Comment? -Hans */
- wait_queue_head_t s_wait;
- /* increased by one every time the tree gets re-balanced */
- atomic_t s_generation_counter;
-
- /* File system properties. Currently holds on-disk FS format */
- unsigned long s_properties;
-
- /* session statistics */
- int s_disk_reads;
- int s_disk_writes;
- int s_fix_nodes;
- int s_do_balance;
- int s_unneeded_left_neighbor;
- int s_good_search_by_key_reada;
- int s_bmaps;
- int s_bmaps_without_search;
- int s_direct2indirect;
- int s_indirect2direct;
-
- /*
- * set up when it's ok for reiserfs_read_inode2() to read an on-disk
- * inode with nlink==0. Currently this is only used during
- * finish_unfinished() processing at mount time
- */
- int s_is_unlinked_ok;
-
- reiserfs_proc_info_data_t s_proc_info_data;
- struct proc_dir_entry *procdir;
-
- /* amount of blocks reserved for further allocations */
- int reserved_blocks;
-
-
- /* this lock is now only used to protect the reserved_blocks variable */
- spinlock_t bitmap_lock;
- struct dentry *priv_root; /* root of /.reiserfs_priv */
- struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
- int j_errno;
-
- int work_queued; /* non-zero when delayed work is queued */
- struct delayed_work old_work; /* old transactions flush delayed work */
- spinlock_t old_work_lock; /* protects old_work and work_queued */
-
-#ifdef CONFIG_QUOTA
- char *s_qf_names[REISERFS_MAXQUOTAS];
- int s_jquota_fmt;
-#endif
- char *s_jdev; /* Stored jdev for mount option showing */
-#ifdef CONFIG_REISERFS_CHECK
-
- /*
- * Detects whether more than one copy of tb exists per superblock
- * as a means of checking whether do_balance is executing
- * concurrently against another tree reader/writer on the same
- * mount point.
- */
- struct tree_balance *cur_tb;
-#endif
-};
-
-/* Definitions of reiserfs on-disk properties: */
-#define REISERFS_3_5 0
-#define REISERFS_3_6 1
-#define REISERFS_OLD_FORMAT 2
-
-/* Mount options */
-enum reiserfs_mount_options {
- /* large tails will be created in a session */
- REISERFS_LARGETAIL,
- /*
- * small (for files less than block size) tails will
- * be created in a session
- */
- REISERFS_SMALLTAIL,
-
- /* replay journal and return 0. Used by fsck */
- REPLAYONLY,
-
- /*
- * -o conv: causes conversion of the old format super block to the
- * new format. If not specified, an old partition will be handled
- * as under 3.5.x
- */
- REISERFS_CONVERT,
-
- /*
- * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
- * reiserfs disks from 3.5.19 or earlier. 99% of the time, this
- * option is not required. If the normal autodetection code can't
- * determine which hash to use (because both hashes had the same
- * value for a file) use this option to force a specific hash.
- * It won't allow you to override the existing hash on the FS, so
- * if you have a tea hash disk, and mount with -o hash=rupasov,
- * the mount will fail.
- */
- FORCE_TEA_HASH, /* try to force tea hash on mount */
- FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
- FORCE_R5_HASH, /* try to force r5 hash on mount */
- FORCE_HASH_DETECT, /* try to detect hash function on mount */
-
- REISERFS_DATA_LOG,
- REISERFS_DATA_ORDERED,
- REISERFS_DATA_WRITEBACK,
-
- /*
- * used for testing experimental features; makes benchmarking a new
- * feature with and without it more convenient; should never be set
- * in any code shipped to users (ideally)
- */
-
- REISERFS_NO_BORDER,
- REISERFS_NO_UNHASHED_RELOCATION,
- REISERFS_HASHED_RELOCATION,
- REISERFS_ATTRS,
- REISERFS_XATTRS_USER,
- REISERFS_POSIXACL,
- REISERFS_EXPOSE_PRIVROOT,
- REISERFS_BARRIER_NONE,
- REISERFS_BARRIER_FLUSH,
-
- /* Actions on error */
- REISERFS_ERROR_PANIC,
- REISERFS_ERROR_RO,
- REISERFS_ERROR_CONTINUE,
-
- REISERFS_USRQUOTA, /* User quota option specified */
- REISERFS_GRPQUOTA, /* Group quota option specified */
-
- REISERFS_TEST1,
- REISERFS_TEST2,
- REISERFS_TEST3,
- REISERFS_TEST4,
- REISERFS_UNSUPPORTED_OPT,
-};
-
-#define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
-#define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
-#define reiserfs_tea_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_TEA_HASH))
-#define reiserfs_hash_detect(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_HASH_DETECT))
-#define reiserfs_no_border(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_BORDER))
-#define reiserfs_no_unhashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION))
-#define reiserfs_hashed_relocation(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_HASHED_RELOCATION))
-#define reiserfs_test4(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_TEST4))
-
-#define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
-#define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
-#define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
-#define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
-#define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
-#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
-#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
-#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
-#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
-#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
-#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
-#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
-#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
-
-#define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
-#define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
-
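-/*
- * Usage sketch (illustrative, not part of the original header): each
- * predicate above tests a single bit of s_mount_opt, so a caller can
- * treat it as a plain boolean, e.g.
- *
- *	if (reiserfs_data_ordered(sb))
- *		...enable ordered-data journaling...
- *
- * which expands to
- *	REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_DATA_ORDERED)
- */
-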
-void reiserfs_file_buffer(struct buffer_head *bh, int list);
-extern struct file_system_type reiserfs_fs_type;
-int reiserfs_resize(struct super_block *, unsigned long);
-
-#define CARRY_ON 0
-#define SCHEDULE_OCCURRED 1
-
-#define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
-#define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
-#define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
-#define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
-
-#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh)
-
-#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
-static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
- *journal)
-{
- return test_bit(J_ABORTED, &journal->j_state);
-}
-
-/*
- * Locking primitives. The write lock is a per superblock
- * special mutex that has properties close to the Big Kernel Lock
- * which was used in the previous locking scheme.
- */
-void reiserfs_write_lock(struct super_block *s);
-void reiserfs_write_unlock(struct super_block *s);
-int __must_check reiserfs_write_unlock_nested(struct super_block *s);
-void reiserfs_write_lock_nested(struct super_block *s, int depth);
-
-#ifdef CONFIG_REISERFS_CHECK
-void reiserfs_lock_check_recursive(struct super_block *s);
-#else
-static inline void reiserfs_lock_check_recursive(struct super_block *s) { }
-#endif
-
-/*
- * Several mutexes depend on the write lock.
- * However sometimes we want to relax the write lock while we hold
- * these mutexes, matching the release/reacquire-on-schedule()
- * behaviour of the Bkl that was previously used.
- * ReiserFS performance and locking were based on this scheme.
- * Now that the write lock is a mutex and not the bkl anymore, doing so
- * may result in a deadlock:
- *
- * A acquire write_lock
- * A acquire j_commit_mutex
- * A release write_lock and wait for something
- * B acquire write_lock
- * B can't acquire j_commit_mutex and sleeps
- * A can't acquire write lock anymore
- * deadlock
- *
- * What we do here is avoid such deadlocks by playing the same game
- * as the Bkl: if we can't acquire a mutex that depends on the write lock,
- * we release the write lock, wait a bit and then retry.
- *
- * The mutexes concerned by this hack are:
- * - The commit mutex of a journal list
- * - The flush mutex
- * - The journal lock
- * - The inode mutex
- */
-static inline void reiserfs_mutex_lock_safe(struct mutex *m,
- struct super_block *s)
-{
- int depth;
-
- depth = reiserfs_write_unlock_nested(s);
- mutex_lock(m);
- reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_mutex_lock_nested_safe(struct mutex *m, unsigned int subclass,
- struct super_block *s)
-{
- int depth;
-
- depth = reiserfs_write_unlock_nested(s);
- mutex_lock_nested(m, subclass);
- reiserfs_write_lock_nested(s, depth);
-}
-
-static inline void
-reiserfs_down_read_safe(struct rw_semaphore *sem, struct super_block *s)
-{
- int depth;
- depth = reiserfs_write_unlock_nested(s);
- down_read(sem);
- reiserfs_write_lock_nested(s, depth);
-}
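-
-/*
- * Usage sketch (illustrative; j_commit_mutex stands in for any of the
- * write-lock-dependent mutexes listed above):
- *
- *	reiserfs_write_lock(sb);
- *	...
- *	reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
- *	...	the write lock is fully released while we sleep on the
- *		mutex, then re-acquired at its previous recursion depth
- *	mutex_unlock(&jl->j_commit_mutex);
- *	reiserfs_write_unlock(sb);
- */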
-
-/*
- * When we schedule, we usually want to also release the write lock,
- * in keeping with the previous Bkl-based locking scheme of reiserfs.
- */
-static inline void reiserfs_cond_resched(struct super_block *s)
-{
- if (need_resched()) {
- int depth;
-
- depth = reiserfs_write_unlock_nested(s);
- schedule();
- reiserfs_write_lock_nested(s, depth);
- }
-}
-
-struct fid;
-
-/*
- * in reading the #defines, it may help to understand that they employ
- * the following abbreviations:
- *
- * B = Buffer
- * I = Item header
- * H = Height within the tree (should be changed to LEV)
- * N = Number of the item in the node
- * STAT = stat data
- * DEH = Directory Entry Header
- * EC = Entry Count
- * E = Entry number
- * UL = Unsigned Long
- * BLKH = BLocK Header
- * UNFM = UNForMatted node
- * DC = Disk Child
- * P = Path
- *
- * These #defines are named by concatenating these abbreviations,
- * where the macro's arguments come first and its return value
- * comes last.
- */
-
-#define USE_INODE_GENERATION_COUNTER
-
-#define REISERFS_PREALLOCATE
-#define DISPLACE_NEW_PACKING_LOCALITIES
-#define PREALLOCATION_SIZE 9
-
-/* n must be power of 2 */
-#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
-
-/*
- * to be ok for alpha and others we have to align structures to 8 byte
- * boundary.
- * FIXME: do not change this alignment: there is code which relies on it
- */
-#define ROUND_UP(x) _ROUND_UP(x,8LL)
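-
-/*
- * Worked example (illustrative): ROUND_UP(13) expands to
- * (13 + 8 - 1) & ~(8 - 1) == 20 & ~7 == 16, and ROUND_UP(16) stays 16;
- * the bit trick is only valid because 8 is a power of 2.
- */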
-
-/*
- * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
- * messages.
- */
-#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
-
-void __reiserfs_warning(struct super_block *s, const char *id,
- const char *func, const char *fmt, ...);
-#define reiserfs_warning(s, id, fmt, args...) \
- __reiserfs_warning(s, id, __func__, fmt, ##args)
-/* assertions handling */
-
-/* always check a condition and panic if it's false. */
-#define __RASSERT(cond, scond, format, args...) \
-do { \
- if (!(cond)) \
- reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
- __FILE__ ":%i:%s: " format "\n", \
- __LINE__, __func__ , ##args); \
-} while (0)
-
-#define RASSERT(cond, format, args...) __RASSERT(cond, #cond, format, ##args)
-
-#if defined(CONFIG_REISERFS_CHECK)
-#define RFALSE(cond, format, args...) __RASSERT(!(cond), "!(" #cond ")", format, ##args)
-#else
-#define RFALSE(cond, format, args...) do { } while (0)
-#endif
-
-#define CONSTF __attribute_const__
-/*
- * Disk Data Structures
- */
-
-/***************************************************************************
- * SUPER BLOCK *
- ***************************************************************************/
-
-/*
- * Structure of super block on disk, a version of which in RAM is often
- * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
- * structure containing fields never written to disk.
- */
-#define UNSET_HASH 0 /* Detect hash on disk */
-#define TEA_HASH 1
-#define YURA_HASH 2
-#define R5_HASH 3
-#define DEFAULT_HASH R5_HASH
-
-struct journal_params {
- /* where the journal starts on its device */
- __le32 jp_journal_1st_block;
-
- /* journal device st_rdev */
- __le32 jp_journal_dev;
-
- /* size of the journal */
- __le32 jp_journal_size;
-
- /* max number of blocks in a transaction. */
- __le32 jp_journal_trans_max;
-
- /*
- * random value made on fs creation
- * (this was sb_journal_block_count)
- */
- __le32 jp_journal_magic;
-
- /* max number of blocks to batch into a trans */
- __le32 jp_journal_max_batch;
-
- /* in seconds, how old can an async commit be */
- __le32 jp_journal_max_commit_age;
-
- /* in seconds, how old can a transaction be */
- __le32 jp_journal_max_trans_age;
-};
-
-/* this is the super from 3.5.X, where X >= 10 */
-struct reiserfs_super_block_v1 {
- __le32 s_block_count; /* blocks count */
- __le32 s_free_blocks; /* free blocks count */
- __le32 s_root_block; /* root block number */
- struct journal_params s_journal;
- __le16 s_blocksize; /* block size */
-
- /* max size of object id array, see get_objectid() commentary */
- __le16 s_oid_maxsize;
- __le16 s_oid_cursize; /* current size of object id array */
-
- /* set to 1 when the filesystem was cleanly unmounted, 2 when not */
- __le16 s_umount_state;
-
- /*
- * reiserfs magic string indicates that file system is reiserfs:
- * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
- */
- char s_magic[10];
-
- /*
- * used by fsck to mark which
- * phase of rebuilding is done
- */
- __le16 s_fs_state;
- /*
- * indicates which hash function is being used
- * to sort names in a directory
- */
- __le32 s_hash_function_code;
- __le16 s_tree_height; /* height of disk tree */
-
- /*
- * number of bitmap blocks needed to address
- * each block of the file system
- */
- __le16 s_bmap_nr;
-
- /*
- * this field is only reliable on filesystems with a non-standard journal
- */
- __le16 s_version;
-
- /*
- * size in blocks of the journal area on the main device; we need to
- * keep it after making an fs with a non-standard journal
- */
- __le16 s_reserved_for_journal;
-} __attribute__ ((__packed__));
-
-#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
-
-/* this is the on disk super block */
-struct reiserfs_super_block {
- struct reiserfs_super_block_v1 s_v1;
- __le32 s_inode_generation;
-
- /* Right now used only by inode-attributes, if enabled */
- __le32 s_flags;
-
- unsigned char s_uuid[16]; /* filesystem unique identifier */
- unsigned char s_label[16]; /* filesystem volume label */
- __le16 s_mnt_count; /* Count of mounts since last fsck */
- __le16 s_max_mnt_count; /* Maximum mounts before check */
- __le32 s_lastcheck; /* Timestamp of last fsck */
- __le32 s_check_interval; /* Interval between checks */
-
- /*
- * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
- * so any additions must be updated there as well.
- */
- char s_unused[76];
-} __attribute__ ((__packed__));
-
-#define SB_SIZE (sizeof(struct reiserfs_super_block))
-
-#define REISERFS_VERSION_1 0
-#define REISERFS_VERSION_2 2
-
-/* on-disk super block fields converted to cpu form */
-#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
-#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
-#define SB_BLOCKSIZE(s) \
- le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_blocksize))
-#define SB_BLOCK_COUNT(s) \
- le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_block_count))
-#define SB_FREE_BLOCKS(s) \
- le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks))
-#define SB_REISERFS_MAGIC(s) \
- (SB_V1_DISK_SUPER_BLOCK(s)->s_magic)
-#define SB_ROOT_BLOCK(s) \
- le32_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_root_block))
-#define SB_TREE_HEIGHT(s) \
- le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height))
-#define SB_REISERFS_STATE(s) \
- le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state))
-#define SB_VERSION(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_version))
-#define SB_BMAP_NR(s) le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr))
-
-#define PUT_SB_BLOCK_COUNT(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0)
-#define PUT_SB_FREE_BLOCKS(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0)
-#define PUT_SB_ROOT_BLOCK(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0)
-#define PUT_SB_TREE_HEIGHT(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
-#define PUT_SB_REISERFS_STATE(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
-#define PUT_SB_VERSION(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
-#define PUT_SB_BMAP_NR(s, val) \
- do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
-
-#define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
-#define SB_ONDISK_JOURNAL_SIZE(s) \
- le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
-#define SB_ONDISK_JOURNAL_1st_BLOCK(s) \
- le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_1st_block))
-#define SB_ONDISK_JOURNAL_DEVICE(s) \
- le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_dev))
-#define SB_ONDISK_RESERVED_FOR_JOURNAL(s) \
- le16_to_cpu ((SB_V1_DISK_SUPER_BLOCK(s)->s_reserved_for_journal))
-
-#define is_block_in_log_or_reserved_area(s, block) \
- ((block) >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
- && (block) < SB_JOURNAL_1st_RESERVED_BLOCK(s) + \
- ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
- SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))))
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs);
-int is_reiserfs_3_6(struct reiserfs_super_block *rs);
-int is_reiserfs_jr(struct reiserfs_super_block *rs);
-
-/*
- * ReiserFS leaves the first 64k unused, so that partition labels have
- * enough space. If someone wants to write a fancy bootloader that
- * needs more than 64k, let us know, and this will be increased in size.
- * This number must be larger than the largest block size on any
- * platform, or code will break. -Hans
- */
-#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
-#define REISERFS_FIRST_BLOCK unused_define
-#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
-
-/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */
-#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024)
-
-/* reiserfs internal error codes (used by search_by_key and fix_nodes) */
-#define CARRY_ON 0
-#define REPEAT_SEARCH -1
-#define IO_ERROR -2
-#define NO_DISK_SPACE -3
-#define NO_BALANCING_NEEDED (-4)
-#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
-#define QUOTA_EXCEEDED -6
-
-typedef __u32 b_blocknr_t;
-typedef __le32 unp_t;
-
-struct unfm_nodeinfo {
- unp_t unfm_nodenum;
- unsigned short unfm_freespace;
-};
-
-/* there are two formats of keys: 3.5 and 3.6 */
-#define KEY_FORMAT_3_5 0
-#define KEY_FORMAT_3_6 1
-
-/* there are two stat data formats */
-#define STAT_DATA_V1 0
-#define STAT_DATA_V2 1
-
-static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
-{
- return container_of(inode, struct reiserfs_inode_info, vfs_inode);
-}
-
-static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-/*
- * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
- * which overflows on large file systems.
- */
-static inline __u32 reiserfs_bmap_count(struct super_block *sb)
-{
- return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
-}
-
-static inline int bmap_would_wrap(unsigned bmap_nr)
-{
- return bmap_nr > ((1LL << 16) - 1);
-}
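-
-/*
- * Worked example (illustrative): with 4096-byte blocks one bitmap
- * block maps 4096 * 8 == 32768 blocks, so a 2,000,000-block fs needs
- * (2000000 - 1) / 32768 + 1 == 62 bitmap blocks. The on-disk u16
- * would only wrap past 65535 bitmap blocks, i.e. beyond roughly 8TiB
- * at this block size.
- */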
-
-extern const struct xattr_handler * const reiserfs_xattr_handlers[];
-
-/*
- * this gives the key version of all items (except stat data)
- * that the object consists of
- */
-#define get_inode_item_key_version( inode ) \
- ((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
-
-#define set_inode_item_key_version( inode, version ) \
- ({ if((version)==KEY_FORMAT_3_6) \
- REISERFS_I(inode)->i_flags |= i_item_key_version_mask; \
- else \
- REISERFS_I(inode)->i_flags &= ~i_item_key_version_mask; })
-
-#define get_inode_sd_version(inode) \
- ((REISERFS_I(inode)->i_flags & i_stat_data_version_mask) ? STAT_DATA_V2 : STAT_DATA_V1)
-
-#define set_inode_sd_version(inode, version) \
- ({ if((version)==STAT_DATA_V2) \
- REISERFS_I(inode)->i_flags |= i_stat_data_version_mask; \
- else \
- REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
-
-/*
- * This is an aggressive tail suppression policy, I am hoping it
- * improves our benchmarks. The principle behind it is that percentage
- * space saving is what matters, not absolute space saving. This is
- * non-intuitive, but it helps to understand it if you consider that the
- * cost to access 4 blocks is not much more than the cost to access 1
- * block, if you have to do a seek and rotate. A tail risks a
- * non-linear disk access that is significant as a percentage of total
- * time cost for a 4 block file and saves an amount of space that is
- * less significant as a percentage of space, or so goes the hypothesis.
- * -Hans
- */
-#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
-(\
- (!(n_tail_size)) || \
- (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \
- ( (n_file_size) >= (n_block_size) * 4 ) || \
- ( ( (n_file_size) >= (n_block_size) * 3 ) && \
- ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \
- ( ( (n_file_size) >= (n_block_size) * 2 ) && \
- ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \
- ( ( (n_file_size) >= (n_block_size) ) && \
- ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
-)
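-
-/*
- * Worked example (illustrative, assuming MAX_DIRECT_ITEM_LEN(4096) is
- * roughly 4000 bytes): a 9000-byte file (>= 2 blocks) with a 2200-byte
- * tail satisfies the "tail >= MAX_DIRECT_ITEM_LEN / 2" clause, so the
- * tail goes to an unformatted node; a 500-byte tail on the same file
- * matches no clause and is stored as a direct item. Percentage saved,
- * not bytes saved, decides.
- */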
-
-/*
- * Another strategy for tails: only create a tail if the whole file
- * would fit into one DIRECT item.
- * The primary intention here is to increase performance by decreasing
- * seeking.
- */
-#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
-(\
- (!(n_tail_size)) || \
- (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
-)
-
-/*
- * values for s_umount_state field
- */
-#define REISERFS_VALID_FS 1
-#define REISERFS_ERROR_FS 2
-
-/*
- * there are 5 item types currently
- */
-#define TYPE_STAT_DATA 0
-#define TYPE_INDIRECT 1
-#define TYPE_DIRECT 2
-#define TYPE_DIRENTRY 3
-#define TYPE_MAXTYPE 3
-#define TYPE_ANY 15 /* FIXME: comment is required */
-
-/***************************************************************************
- * KEY & ITEM HEAD *
- ***************************************************************************/
-
-/* directories use this key as well as old files */
-struct offset_v1 {
- __le32 k_offset;
- __le32 k_uniqueness;
-} __attribute__ ((__packed__));
-
-struct offset_v2 {
- __le64 v;
-} __attribute__ ((__packed__));
-
-static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
-{
- __u8 type = le64_to_cpu(v2->v) >> 60;
- return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
-}
-
-static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
-{
- v2->v =
- (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
-}
-
-static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
-{
- return le64_to_cpu(v2->v) & (~0ULL >> 4);
-}
-
-static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
-{
- offset &= (~0ULL >> 4);
- v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
-}
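-
-/*
- * Layout sketch (illustrative): the v2 offset packs the item type into
- * the top 4 bits of a little-endian 64-bit word and the byte offset
- * into the low 60 bits. E.g. TYPE_INDIRECT (1) at offset 4097 is
- * stored as cpu_to_le64((1ULL << 60) | 4097); the helpers above mask
- * with ~0ULL >> 4 to separate the two halves.
- */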
-
-/*
- * Key of an item determines its location in the S+tree, and
- * is composed of 4 components
- */
-struct reiserfs_key {
- /* packing locality: by default parent directory object id */
- __le32 k_dir_id;
-
- __le32 k_objectid; /* object identifier */
- union {
- struct offset_v1 k_offset_v1;
- struct offset_v2 k_offset_v2;
- } __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-struct in_core_key {
- /* packing locality: by default parent directory object id */
- __u32 k_dir_id;
- __u32 k_objectid; /* object identifier */
- __u64 k_offset;
- __u8 k_type;
-};
-
-struct cpu_key {
- struct in_core_key on_disk_key;
- int version;
- /* 3 in all cases but direct2indirect and indirect2direct conversion */
- int key_length;
-};
-
-/*
- * Our function for comparing keys can compare keys of different
- * lengths. It takes as a parameter the length of the keys it is to
- * compare. These defines are used in determining what is to be passed
- * to it as that parameter.
- */
-#define REISERFS_FULL_KEY_LEN 4
-#define REISERFS_SHORT_KEY_LEN 2
-
-/* The result of the key compare */
-#define FIRST_GREATER 1
-#define SECOND_GREATER -1
-#define KEYS_IDENTICAL 0
-#define KEY_FOUND 1
-#define KEY_NOT_FOUND 0
-
-#define KEY_SIZE (sizeof(struct reiserfs_key))
-
-/* return values for search_by_key and clones */
-#define ITEM_FOUND 1
-#define ITEM_NOT_FOUND 0
-#define ENTRY_FOUND 1
-#define ENTRY_NOT_FOUND 0
-#define DIRECTORY_NOT_FOUND -1
-#define REGULAR_FILE_FOUND -2
-#define DIRECTORY_FOUND -3
-#define BYTE_FOUND 1
-#define BYTE_NOT_FOUND 0
-#define FILE_NOT_FOUND -1
-
-#define POSITION_FOUND 1
-#define POSITION_NOT_FOUND 0
-
-/* return values for reiserfs_find_entry and search_by_entry_key */
-#define NAME_FOUND 1
-#define NAME_NOT_FOUND 0
-#define GOTO_PREVIOUS_ITEM 2
-#define NAME_FOUND_INVISIBLE 3
-
-/*
- * Everything in the filesystem is stored as a set of items. The
- * item head contains the key of the item, its free space (for
- * indirect items) and specifies the location of the item itself
- * within the block.
- */
-
-struct item_head {
- /*
- * Everything in the tree is found by searching for it based on
- * its key.
- */
- struct reiserfs_key ih_key;
- union {
- /*
- * The free space in the last unformatted node of an
- * indirect item if this is an indirect item. This
- * equals 0xFFFF iff this is a direct item or stat data
- * item. Note that the key, not this field, is used to
- * determine the item type, and thus which field this
- * union contains.
- */
- __le16 ih_free_space_reserved;
-
- /*
- * Iff this is a directory item, this field equals the
- * number of directory entries in the directory item.
- */
- __le16 ih_entry_count;
- } __attribute__ ((__packed__)) u;
- __le16 ih_item_len; /* total size of the item body */
-
- /* an offset to the item body within the block */
- __le16 ih_item_location;
-
- /*
- * 0 for all old items, 2 for new ones. The highest bit is set
- * temporarily by fsck and cleared when it is done
- */
- __le16 ih_version;
-} __attribute__ ((__packed__));
-/* size of item header */
-#define IH_SIZE (sizeof(struct item_head))
-
-#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved)
-#define ih_version(ih) le16_to_cpu((ih)->ih_version)
-#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count)
-#define ih_location(ih) le16_to_cpu((ih)->ih_item_location)
-#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len)
-
-#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0)
-#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0)
-#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0)
-#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
-#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
-
-#define unreachable_item(ih) (ih_version(ih) & (1 << 15))
-
-#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
-#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
-
-/*
- * these operate on indirect items, where you've got an array of ints
- * at a possibly unaligned location. These are a noop on ia32
- *
- * p is the array of __u32, i is the index into the array, v is the value
- * to store there.
- */
-#define get_block_num(p, i) get_unaligned_le32((p) + (i))
-#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
-
-/* in the old version the uniqueness field shows the key type */
-#define V1_SD_UNIQUENESS 0
-#define V1_INDIRECT_UNIQUENESS 0xfffffffe
-#define V1_DIRECT_UNIQUENESS 0xffffffff
-#define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */
-
-/* here are conversion routines */
-static inline int uniqueness2type(__u32 uniqueness) CONSTF;
-static inline int uniqueness2type(__u32 uniqueness)
-{
- switch ((int)uniqueness) {
- case V1_SD_UNIQUENESS:
- return TYPE_STAT_DATA;
- case V1_INDIRECT_UNIQUENESS:
- return TYPE_INDIRECT;
- case V1_DIRECT_UNIQUENESS:
- return TYPE_DIRECT;
- case V1_DIRENTRY_UNIQUENESS:
- return TYPE_DIRENTRY;
- case V1_ANY_UNIQUENESS:
- default:
- return TYPE_ANY;
- }
-}
-
-static inline __u32 type2uniqueness(int type) CONSTF;
-static inline __u32 type2uniqueness(int type)
-{
- switch (type) {
- case TYPE_STAT_DATA:
- return V1_SD_UNIQUENESS;
- case TYPE_INDIRECT:
- return V1_INDIRECT_UNIQUENESS;
- case TYPE_DIRECT:
- return V1_DIRECT_UNIQUENESS;
- case TYPE_DIRENTRY:
- return V1_DIRENTRY_UNIQUENESS;
- case TYPE_ANY:
- default:
- return V1_ANY_UNIQUENESS;
- }
-}
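-
-/*
- * Illustrative roundtrip: type2uniqueness(TYPE_DIRECT) yields
- * 0xffffffff (V1_DIRECT_UNIQUENESS) and uniqueness2type(0xffffffff)
- * maps back to TYPE_DIRECT; any unknown value degrades to TYPE_ANY
- * instead of being rejected.
- */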
-
-/*
- * key is a pointer to the on-disk key, which is stored in le form;
- * the result is in cpu form. There is no way to get an object's
- * version from its key, so the version is passed to these helpers
- */
-static inline loff_t le_key_k_offset(int version,
- const struct reiserfs_key *key)
-{
- return (version == KEY_FORMAT_3_5) ?
- le32_to_cpu(key->u.k_offset_v1.k_offset) :
- offset_v2_k_offset(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_offset(const struct item_head *ih)
-{
- return le_key_k_offset(ih_version(ih), &(ih->ih_key));
-}
-
-static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
-{
- if (version == KEY_FORMAT_3_5) {
- loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
- return uniqueness2type(val);
- } else
- return offset_v2_k_type(&(key->u.k_offset_v2));
-}
-
-static inline loff_t le_ih_k_type(const struct item_head *ih)
-{
- return le_key_k_type(ih_version(ih), &(ih->ih_key));
-}
-
-static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
- loff_t offset)
-{
- if (version == KEY_FORMAT_3_5)
- key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
- else
- set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
-}
-
-static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
- loff_t offset)
-{
- set_le_key_k_offset(version, key,
- le_key_k_offset(version, key) + offset);
-}
-
-static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
- add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
-{
- set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
-}
-
-static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
- int type)
-{
- if (version == KEY_FORMAT_3_5) {
- type = type2uniqueness(type);
- key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
- } else
- set_offset_v2_k_type(&key->u.k_offset_v2, type);
-}
-
-static inline void set_le_ih_k_type(struct item_head *ih, int type)
-{
- set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
-}
-
-static inline int is_direntry_le_key(int version, struct reiserfs_key *key)
-{
- return le_key_k_type(version, key) == TYPE_DIRENTRY;
-}
-
-static inline int is_direct_le_key(int version, struct reiserfs_key *key)
-{
- return le_key_k_type(version, key) == TYPE_DIRECT;
-}
-
-static inline int is_indirect_le_key(int version, struct reiserfs_key *key)
-{
- return le_key_k_type(version, key) == TYPE_INDIRECT;
-}
-
-static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
-{
- return le_key_k_type(version, key) == TYPE_STAT_DATA;
-}
-
-/* item header has version. */
-static inline int is_direntry_le_ih(struct item_head *ih)
-{
- return is_direntry_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_direct_le_ih(struct item_head *ih)
-{
- return is_direct_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_indirect_le_ih(struct item_head *ih)
-{
- return is_indirect_le_key(ih_version(ih), &ih->ih_key);
-}
-
-static inline int is_statdata_le_ih(struct item_head *ih)
-{
- return is_statdata_le_key(ih_version(ih), &ih->ih_key);
-}
-
-/* key is pointer to cpu key, result is cpu */
-static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
-{
- return key->on_disk_key.k_offset;
-}
-
-static inline loff_t cpu_key_k_type(const struct cpu_key *key)
-{
- return key->on_disk_key.k_type;
-}
-
-static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
-{
- key->on_disk_key.k_offset = offset;
-}
-
-static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
-{
- key->on_disk_key.k_type = type;
-}
-
-static inline void cpu_key_k_offset_dec(struct cpu_key *key)
-{
- key->on_disk_key.k_offset--;
-}
-
-#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
-#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT)
-#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
-#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
-
-/* are these used ? */
-#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
-#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
-#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
-#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
-
-#define I_K_KEY_IN_ITEM(ih, key, n_blocksize) \
- (!COMP_SHORT_KEYS(ih, key) && \
- I_OFF_BYTE_IN_ITEM(ih, k_offset(key), n_blocksize))
-
-/* maximal length of item */
-#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
-#define MIN_ITEM_LEN 1
-
-/* object identifier for root dir */
-#define REISERFS_ROOT_OBJECTID 2
-#define REISERFS_ROOT_PARENT_OBJECTID 1
-
-extern struct reiserfs_key root_key;
-
-/*
- * Picture represents a leaf of the S+tree
- * ______________________________________________________
- * | | Array of | | |
- * |Block | Object-Item | F r e e | Objects- |
- * | head | Headers | S p a c e | Items |
- * |______|_______________|___________________|___________|
- */
-
-/*
- * Header of a disk block. More precisely, header of a formatted leaf
- * or internal node, and not the header of an unformatted node.
- */
-struct block_head {
- __le16 blk_level; /* Level of a block in the tree. */
- __le16 blk_nr_item; /* Number of keys/items in a block. */
- __le16 blk_free_space; /* Block free space in bytes. */
- __le16 blk_reserved;
- /* dump this in v4/planA */
-
- /* kept only for compatibility */
- struct reiserfs_key blk_right_delim_key;
-};
-
-#define BLKH_SIZE (sizeof(struct block_head))
-#define blkh_level(p_blkh) (le16_to_cpu((p_blkh)->blk_level))
-#define blkh_nr_item(p_blkh) (le16_to_cpu((p_blkh)->blk_nr_item))
-#define blkh_free_space(p_blkh) (le16_to_cpu((p_blkh)->blk_free_space))
-#define blkh_reserved(p_blkh) (le16_to_cpu((p_blkh)->blk_reserved))
-#define set_blkh_level(p_blkh,val) ((p_blkh)->blk_level = cpu_to_le16(val))
-#define set_blkh_nr_item(p_blkh,val) ((p_blkh)->blk_nr_item = cpu_to_le16(val))
-#define set_blkh_free_space(p_blkh,val) ((p_blkh)->blk_free_space = cpu_to_le16(val))
-#define set_blkh_reserved(p_blkh,val) ((p_blkh)->blk_reserved = cpu_to_le16(val))
-#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
-#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
-
-/* values for blk_level field of the struct block_head */
-
-/*
- * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
- * It is then used to see whether the node is still in the tree
- */
-#define FREE_LEVEL 0
-
-#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
-
-/*
- * Given the buffer head of a formatted node, resolve to the
- * block head of that node.
- */
-#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
-/* Number of items that are in buffer. */
-#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
-#define B_LEVEL(bh) (blkh_level(B_BLK_HEAD(bh)))
-#define B_FREE_SPACE(bh) (blkh_free_space(B_BLK_HEAD(bh)))
-
-#define PUT_B_NR_ITEMS(bh, val) do { set_blkh_nr_item(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_LEVEL(bh, val) do { set_blkh_level(B_BLK_HEAD(bh), val); } while (0)
-#define PUT_B_FREE_SPACE(bh, val) do { set_blkh_free_space(B_BLK_HEAD(bh), val); } while (0)
-
-/* Get right delimiting key. -- little endian */
-#define B_PRIGHT_DELIM_KEY(bh) (&(blkh_right_delim_key(B_BLK_HEAD(bh))))
-
-/* Does the buffer contain a disk leaf. */
-#define B_IS_ITEMS_LEVEL(bh) (B_LEVEL(bh) == DISK_LEAF_NODE_LEVEL)
-
-/* Does the buffer contain a disk internal node */
-#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
- && B_LEVEL(bh) <= MAX_HEIGHT)
-
-/***************************************************************************
- * STAT DATA *
- ***************************************************************************/
-
-/*
- * old stat data is 32 bytes long. We distinguish the new one by its
- * different size
- */
-struct stat_data_v1 {
- __le16 sd_mode; /* file type, permissions */
- __le16 sd_nlink; /* number of hard links */
- __le16 sd_uid; /* owner */
- __le16 sd_gid; /* group */
- __le32 sd_size; /* file size */
- __le32 sd_atime; /* time of last access */
- __le32 sd_mtime; /* time file was last modified */
-
- /*
- * time inode (stat data) was last changed
- * (except changes to sd_atime and sd_mtime)
- */
- __le32 sd_ctime;
- union {
- __le32 sd_rdev;
- __le32 sd_blocks; /* number of blocks file uses */
- } __attribute__ ((__packed__)) u;
-
- /*
- * first byte of file which is stored in a direct item: except that if
- * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
- * direct item. The existence of this field really grates on me.
- * Let's replace it with a macro based on sd_size and our tail
- * suppression policy. Someday. -Hans
- */
- __le32 sd_first_direct_byte;
-} __attribute__ ((__packed__));
-
-#define SD_V1_SIZE (sizeof(struct stat_data_v1))
-#define stat_data_v1(ih) (ih_version (ih) == KEY_FORMAT_3_5)
-#define sd_v1_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v1_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
-#define sd_v1_nlink(sdp) (le16_to_cpu((sdp)->sd_nlink))
-#define set_sd_v1_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le16(v))
-#define sd_v1_uid(sdp) (le16_to_cpu((sdp)->sd_uid))
-#define set_sd_v1_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le16(v))
-#define sd_v1_gid(sdp) (le16_to_cpu((sdp)->sd_gid))
-#define set_sd_v1_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le16(v))
-#define sd_v1_size(sdp) (le32_to_cpu((sdp)->sd_size))
-#define set_sd_v1_size(sdp,v) ((sdp)->sd_size = cpu_to_le32(v))
-#define sd_v1_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v1_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v1_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v1_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v1_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v1_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v1_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v1_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v1_blocks(sdp) (le32_to_cpu((sdp)->u.sd_blocks))
-#define set_sd_v1_blocks(sdp,v) ((sdp)->u.sd_blocks = cpu_to_le32(v))
-#define sd_v1_first_direct_byte(sdp) \
- (le32_to_cpu((sdp)->sd_first_direct_byte))
-#define set_sd_v1_first_direct_byte(sdp,v) \
- ((sdp)->sd_first_direct_byte = cpu_to_le32(v))
-
-/* inode flags stored in sd_attrs (nee sd_reserved) */
-
-/*
- * we want common flags to have the same values as in ext2,
- * so chattr(1) will work without problems
- */
-#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
-#define REISERFS_APPEND_FL FS_APPEND_FL
-#define REISERFS_SYNC_FL FS_SYNC_FL
-#define REISERFS_NOATIME_FL FS_NOATIME_FL
-#define REISERFS_NODUMP_FL FS_NODUMP_FL
-#define REISERFS_SECRM_FL FS_SECRM_FL
-#define REISERFS_UNRM_FL FS_UNRM_FL
-#define REISERFS_COMPR_FL FS_COMPR_FL
-#define REISERFS_NOTAIL_FL FS_NOTAIL_FL
-
-/* persistent flags that file inherits from the parent directory */
-#define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \
- REISERFS_SYNC_FL | \
- REISERFS_NOATIME_FL | \
- REISERFS_NODUMP_FL | \
- REISERFS_SECRM_FL | \
- REISERFS_COMPR_FL | \
- REISERFS_NOTAIL_FL )
-
-/*
- * Stat Data on disk (reiserfs version of UFS disk inode minus the
- * address blocks)
- */
-struct stat_data {
- __le16 sd_mode; /* file type, permissions */
- __le16 sd_attrs; /* persistent inode flags */
- __le32 sd_nlink; /* number of hard links */
- __le64 sd_size; /* file size */
- __le32 sd_uid; /* owner */
- __le32 sd_gid; /* group */
- __le32 sd_atime; /* time of last access */
- __le32 sd_mtime; /* time file was last modified */
-
- /*
- * time inode (stat data) was last changed
- * (except changes to sd_atime and sd_mtime)
- */
- __le32 sd_ctime;
- __le32 sd_blocks;
- union {
- __le32 sd_rdev;
- __le32 sd_generation;
- } __attribute__ ((__packed__)) u;
-} __attribute__ ((__packed__));
-
-/* this is 44 bytes long */
-#define SD_SIZE (sizeof(struct stat_data))
-#define SD_V2_SIZE SD_SIZE
-#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
-#define sd_v2_mode(sdp) (le16_to_cpu((sdp)->sd_mode))
-#define set_sd_v2_mode(sdp,v) ((sdp)->sd_mode = cpu_to_le16(v))
-/* sd_reserved */
-/* set_sd_reserved */
-#define sd_v2_nlink(sdp) (le32_to_cpu((sdp)->sd_nlink))
-#define set_sd_v2_nlink(sdp,v) ((sdp)->sd_nlink = cpu_to_le32(v))
-#define sd_v2_size(sdp) (le64_to_cpu((sdp)->sd_size))
-#define set_sd_v2_size(sdp,v) ((sdp)->sd_size = cpu_to_le64(v))
-#define sd_v2_uid(sdp) (le32_to_cpu((sdp)->sd_uid))
-#define set_sd_v2_uid(sdp,v) ((sdp)->sd_uid = cpu_to_le32(v))
-#define sd_v2_gid(sdp) (le32_to_cpu((sdp)->sd_gid))
-#define set_sd_v2_gid(sdp,v) ((sdp)->sd_gid = cpu_to_le32(v))
-#define sd_v2_atime(sdp) (le32_to_cpu((sdp)->sd_atime))
-#define set_sd_v2_atime(sdp,v) ((sdp)->sd_atime = cpu_to_le32(v))
-#define sd_v2_mtime(sdp) (le32_to_cpu((sdp)->sd_mtime))
-#define set_sd_v2_mtime(sdp,v) ((sdp)->sd_mtime = cpu_to_le32(v))
-#define sd_v2_ctime(sdp) (le32_to_cpu((sdp)->sd_ctime))
-#define set_sd_v2_ctime(sdp,v) ((sdp)->sd_ctime = cpu_to_le32(v))
-#define sd_v2_blocks(sdp) (le32_to_cpu((sdp)->sd_blocks))
-#define set_sd_v2_blocks(sdp,v) ((sdp)->sd_blocks = cpu_to_le32(v))
-#define sd_v2_rdev(sdp) (le32_to_cpu((sdp)->u.sd_rdev))
-#define set_sd_v2_rdev(sdp,v) ((sdp)->u.sd_rdev = cpu_to_le32(v))
-#define sd_v2_generation(sdp) (le32_to_cpu((sdp)->u.sd_generation))
-#define set_sd_v2_generation(sdp,v) ((sdp)->u.sd_generation = cpu_to_le32(v))
-#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
-#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
-
-/***************************************************************************
- * DIRECTORY STRUCTURE *
- ***************************************************************************/
-/*
- * Picture represents the structure of directory items
- * ________________________________________________
- * | Array of | | | | | |
- * | directory |N-1| N-2 | .... | 1st |0th|
- * | entry headers | | | | | |
- * |_______________|___|_____|________|_______|___|
- * <---- directory entries ------>
- *
- * First directory item has k_offset component 1. We store "." and ".."
- * in one item, always, we never split "." and ".." into differing
- * items. This makes, among other things, the code for removing
- * directories simpler.
- */
-#define SD_OFFSET 0
-#define SD_UNIQUENESS 0
-#define DOT_OFFSET 1
-#define DOT_DOT_OFFSET 2
-#define DIRENTRY_UNIQUENESS 500
-
-#define FIRST_ITEM_OFFSET 1
-
-/*
- * Q: How do we get the key of the object an entry points to?
- *
- * A: Each directory entry has a header. The header's deh_dir_id
- * and deh_objectid fields form the key of the object the entry
- * points to
- */
-
-/*
- * NOT IMPLEMENTED:
- * Directory will someday contain stat data of object
- */
-
-struct reiserfs_de_head {
- __le32 deh_offset; /* third component of the directory entry key */
-
- /*
- * objectid of the parent directory of the object that is referenced
- * by the directory entry
- */
- __le32 deh_dir_id;
-
- /* objectid of the object, that is referenced by directory entry */
- __le32 deh_objectid;
- __le16 deh_location; /* offset of name in the whole item */
-
- /*
- * whether 1) the entry contains stat data (for the future), and
- * 2) the entry is hidden (unlinked)
- */
- __le16 deh_state;
-} __attribute__ ((__packed__));
-#define DEH_SIZE sizeof(struct reiserfs_de_head)
-#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
-#define deh_dir_id(p_deh) (le32_to_cpu((p_deh)->deh_dir_id))
-#define deh_objectid(p_deh) (le32_to_cpu((p_deh)->deh_objectid))
-#define deh_location(p_deh) (le16_to_cpu((p_deh)->deh_location))
-#define deh_state(p_deh) (le16_to_cpu((p_deh)->deh_state))
-
-#define put_deh_offset(p_deh,v) ((p_deh)->deh_offset = cpu_to_le32((v)))
-#define put_deh_dir_id(p_deh,v) ((p_deh)->deh_dir_id = cpu_to_le32((v)))
-#define put_deh_objectid(p_deh,v) ((p_deh)->deh_objectid = cpu_to_le32((v)))
-#define put_deh_location(p_deh,v) ((p_deh)->deh_location = cpu_to_le16((v)))
-#define put_deh_state(p_deh,v) ((p_deh)->deh_state = cpu_to_le16((v)))
-
-/* empty directory contains two entries "." and ".." and their headers */
-#define EMPTY_DIR_SIZE \
-(DEH_SIZE * 2 + ROUND_UP (sizeof(".") - 1) + ROUND_UP (sizeof("..") - 1))
-
-/* old format directories have this size when empty */
-#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
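-
-/*
- * Worked example (illustrative): DEH_SIZE is 16 bytes, so
- * EMPTY_DIR_SIZE == 2 * 16 + ROUND_UP(1) + ROUND_UP(2) == 48 bytes,
- * while the old format packed the names "." and ".." unpadded:
- * 2 * 16 + 3 == 35 bytes.
- */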
-
-#define DEH_Statdata 0 /* not used now */
-#define DEH_Visible 2
-
-/* 64 bit systems (and the S/390 and PA-RISC) need to be aligned explicitly -jdm */
-#if BITS_PER_LONG == 64 || defined(__s390__) || defined(__hppa__)
-# define ADDR_UNALIGNED_BITS (3)
-#endif
-
-/*
- * These are only used to manipulate deh_state.
- * Because of this, we'll use the ext2_ bit routines,
- * since they are little endian
- */
-#ifdef ADDR_UNALIGNED_BITS
-
-# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
-# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3)
-
-# define set_bit_unaligned(nr, addr) \
- __test_and_set_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-# define clear_bit_unaligned(nr, addr) \
- __test_and_clear_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-# define test_bit_unaligned(nr, addr) \
- test_bit_le((nr) + unaligned_offset(addr), aligned_address(addr))
-
-#else
-
-# define set_bit_unaligned(nr, addr) __test_and_set_bit_le(nr, addr)
-# define clear_bit_unaligned(nr, addr) __test_and_clear_bit_le(nr, addr)
-# define test_bit_unaligned(nr, addr) test_bit_le(nr, addr)
-
-#endif
-
-#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state))
-#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
-
-extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
- __le32 par_dirid, __le32 par_objid);
-extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
- __le32 par_dirid, __le32 par_objid);
-
-/* two entries per block (at least) */
-#define REISERFS_MAX_NAME(block_size) 255
-
-/*
- * this structure is used for operations on directory entries. It is
- * not a disk structure.
- *
- * When reiserfs_find_entry or search_by_entry_key find a directory
- * entry, they return a filled reiserfs_dir_entry structure
- */
-struct reiserfs_dir_entry {
- struct buffer_head *de_bh;
- int de_item_num;
- struct item_head *de_ih;
- int de_entry_num;
- struct reiserfs_de_head *de_deh;
- int de_entrylen;
- int de_namelen;
- char *de_name;
- unsigned long *de_gen_number_bit_string;
-
- __u32 de_dir_id;
- __u32 de_objectid;
-
- struct cpu_key de_entry_key;
-};
-
-/*
- * these defines are useful when a particular member of
- * a reiserfs_dir_entry is needed
- */
-
-/* pointer to file name, stored in entry */
-#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
- (ih_item_body(bh, ih) + deh_location(deh))
-
-/* length of name */
-#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
-(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
-
-/* hash value occupies bits from 7 up to 30 */
-#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
-/* generation number occupies 7 bits starting from 0 up to 6 */
-#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL)
-#define MAX_GENERATION_NUMBER 127
-
-#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
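-
-/*
- * Worked example (illustrative): for a third key component of 0x2a81,
- * GET_HASH_VALUE(0x2a81) == 0x2a80 and GET_GENERATION_NUMBER(0x2a81)
- * == 1, i.e. the second name that hashed to 0x2a80. Up to 128
- * colliding names (generations 0..127) can share one hash value.
- */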
-
-/*
- * Picture represents an internal node of the reiserfs tree
- * ______________________________________________________
- * | | Array of | Array of | Free |
- * |block | keys | pointers | space |
- * | head | N | N+1 | |
- * |______|_______________|___________________|___________|
- */
-
-/***************************************************************************
- * DISK CHILD *
- ***************************************************************************/
-/*
- * Disk child pointer:
- * The pointer from an internal node of the tree to a node that is on disk.
- */
-struct disk_child {
- __le32 dc_block_number; /* Disk child's block number. */
- __le16 dc_size; /* Disk child's used space. */
- __le16 dc_reserved;
-};
-
-#define DC_SIZE (sizeof(struct disk_child))
-#define dc_block_number(dc_p) (le32_to_cpu((dc_p)->dc_block_number))
-#define dc_size(dc_p) (le16_to_cpu((dc_p)->dc_size))
-#define put_dc_block_number(dc_p, val) do { (dc_p)->dc_block_number = cpu_to_le32(val); } while(0)
-#define put_dc_size(dc_p, val) do { (dc_p)->dc_size = cpu_to_le16(val); } while(0)
-
-/* Get disk child by buffer header and position in the tree node. */
-#define B_N_CHILD(bh, n_pos) ((struct disk_child *)\
-((bh)->b_data + BLKH_SIZE + B_NR_ITEMS(bh) * KEY_SIZE + DC_SIZE * (n_pos)))
-
-/* Get disk child number by buffer header and position in the tree node. */
-#define B_N_CHILD_NUM(bh, n_pos) (dc_block_number(B_N_CHILD(bh, n_pos)))
-#define PUT_B_N_CHILD_NUM(bh, n_pos, val) \
- (put_dc_block_number(B_N_CHILD(bh, n_pos), val))
-
-/* maximal value of field child_size in structure disk_child */
-/* child size is the combined size of all items and their headers */
-#define MAX_CHILD_SIZE(bh) ((int)((bh)->b_size - BLKH_SIZE))
-
-/* amount of used space in buffer (not including block head) */
-#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur)))
-
-/* max and min number of keys in internal node */
-#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
-#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
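-
-/*
- * Worked example (illustrative): with 4096-byte blocks,
- * MAX_CHILD_SIZE == 4096 - 24 == 4072, so an internal node holds at
- * most MAX_NR_KEY == (4072 - 8) / (16 + 8) == 169 keys plus 170
- * disk-child pointers, and should keep at least MIN_NR_KEY == 84 keys.
- */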
-
-/***************************************************************************
- * PATH STRUCTURES AND DEFINES *
- ***************************************************************************/
-
-/*
- * search_by_key fills up the path from the root to the leaf as it descends
- * the tree looking for the key. It uses reiserfs_bread to try to find
- * buffers in the cache given their block number. If it does not find
- * them in the cache it reads them from disk. For each node search_by_key
- * finds using reiserfs_bread it then uses bin_search to look through that
- * node. bin_search will find the position of the block_number of the next
- * node if it is looking through an internal node. If it is looking through
- * a leaf node bin_search will find the position of the item which has key
- * either equal to given key, or which is the maximal key less than the
- * given key.
- */
-
-struct path_element {
- /* Pointer to the buffer at the path in the tree. */
- struct buffer_head *pe_buffer;
- /* Position in the tree node which is placed in the buffer above. */
- int pe_position;
-};
-
-/*
- * maximal height of a tree. don't change this without
- * changing JOURNAL_PER_BALANCE_CNT
- */
-#define MAX_HEIGHT 5
-
-/* Must equal MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define EXTENDED_MAX_HEIGHT 7
-
-/* Must be equal to at least 2. */
-#define FIRST_PATH_ELEMENT_OFFSET 2
-
-/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1
-
-/* this MUST be MAX_HEIGHT + 1. See about FEB below */
-#define MAX_FEB_SIZE 6
-
-/*
- * We need to keep track of who the ancestors of nodes are. When we
- * perform a search we record which nodes were visited while
- * descending the tree looking for the node we searched for. This list
- * of nodes is called the path. This information is used while
- * performing balancing. Note that this path information may become
- * invalid, and this means we must check it when using it to see if it
- * is still valid. You'll need to read search_by_key and the comments
- * in it, especially about decrement_counters_in_path(), to understand
- * this structure.
- *
- * Paths make the code so much harder to work with and debug.... An
- * enormous number of bugs are due to them, and trying to write or modify
- * code that uses them just makes my head hurt. They are based on an
- * excessive effort to avoid disturbing the precious VFS code.:-( The
- * gods only know how we are going to SMP the code that uses them.
- * znodes are the way!
- */
-
-#define PATH_READA 0x1 /* do read ahead */
-#define PATH_READA_BACK 0x2 /* read backwards */
-
-struct treepath {
- int path_length; /* Length of the path_elements array below. */
- int reada;
- /* Array of the path elements. */
- struct path_element path_elements[EXTENDED_MAX_HEIGHT];
- int pos_in_item;
-};
-
-#define pos_in_item(path) ((path)->pos_in_item)
-
-#define INITIALIZE_PATH(var) \
-struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
-
-/* Get path element by path and path position. */
-#define PATH_OFFSET_PELEMENT(path, n_offset) ((path)->path_elements + (n_offset))
-
-/* Get buffer header at the path by path and path position. */
-#define PATH_OFFSET_PBUFFER(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_buffer)
-
-/* Get position in the element at the path by path and path position. */
-#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
-
-#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
-
-/*
- * you know, to the person who didn't write this the macro name does not
- * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or
- * maybe we should just focus on dumping paths... -Hans
- */
-#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
-
-/*
- * in do_balance leaf has h == 0 in contrast with path structure,
- * where root has level == 0. That is why we need these defines
- */
-
-/* tb->S[h] */
-#define PATH_H_PBUFFER(path, h) \
- PATH_OFFSET_PBUFFER(path, path->path_length - (h))
-
-/* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
-
-#define PATH_H_POSITION(path, h) \
- PATH_OFFSET_POSITION(path, path->path_length - (h))
-
-/* tb->S[h]->b_item_order */
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, (h) + 1)
-
-#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
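-
-/*
- * Illustrative mapping: for a path of path_length 5,
- * PATH_H_PBUFFER(path, 0) is the leaf at path_elements[5] and
- * PATH_H_PBUFFER(path, 1) is its parent at path_elements[4]:
- * do_balance counts up from the leaf (h == 0) while the path array
- * counts down from the root.
- */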
-
-static inline void *reiserfs_node_data(const struct buffer_head *bh)
-{
- return bh->b_data + sizeof(struct block_head);
-}
-
-/* get key from internal node */
-static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
- int item_num)
-{
- struct reiserfs_key *key = reiserfs_node_data(bh);
-
- return &key[item_num];
-}
-
-/* get the item header from leaf node */
-static inline struct item_head *item_head(const struct buffer_head *bh,
- int item_num)
-{
- struct item_head *ih = reiserfs_node_data(bh);
-
- return &ih[item_num];
-}
-
-/* get the key from leaf node */
-static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
- int item_num)
-{
- return &item_head(bh, item_num)->ih_key;
-}
-
-static inline void *ih_item_body(const struct buffer_head *bh,
- const struct item_head *ih)
-{
- return bh->b_data + ih_location(ih);
-}
-
-/* get item body from leaf node */
-static inline void *item_body(const struct buffer_head *bh, int item_num)
-{
- return ih_item_body(bh, item_head(bh, item_num));
-}
-
-static inline struct item_head *tp_item_head(const struct treepath *path)
-{
- return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-static inline void *tp_item_body(const struct treepath *path)
-{
- return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
-}
-
-#define get_last_bh(path) PATH_PLAST_BUFFER(path)
-#define get_item_pos(path) PATH_LAST_POSITION(path)
-#define item_moved(ih,path) comp_items(ih, path)
-#define path_changed(ih,path) comp_items (ih, path)
-
-/* array of the entry headers, at the start of the item body */
-#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
-
-/*
- * length of a directory entry in a directory item. This helper
- * calculates the length of the i-th directory entry using the entry
- * locations from the directory entry heads. When it calculates the
- * length of the 0-th entry, it uses the length of the whole item in
- * place of the entry location of the non-existent following entry.
- * See picture above.
- */
-static inline int entry_length(const struct buffer_head *bh,
- const struct item_head *ih, int pos_in_item)
-{
- struct reiserfs_de_head *deh;
-
- deh = B_I_DEH(bh, ih) + pos_in_item;
- if (pos_in_item)
- return deh_location(deh - 1) - deh_location(deh);
-
- return ih_item_len(ih) - deh_location(deh);
-}
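/*
 * Editor's sketch (illustrative only): walking the entries of a single
 * directory item with the helpers above, assuming this header's
 * ih_entry_count() accessor for the entry count kept in the item head.
 */
static void example_print_entries(const struct buffer_head *bh,
				  const struct item_head *ih)
{
	struct reiserfs_de_head *deh = B_I_DEH(bh, ih);
	int i;

	for (i = 0; i < ih_entry_count(ih); i++)
		printk(KERN_DEBUG "entry %d at offset %u, %d bytes\n",
		       i, deh_location(deh + i), entry_length(bh, ih, i));
}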
-
-/***************************************************************************
- * MISC *
- ***************************************************************************/
-
-/* Size of pointer to the unformatted node. */
-#define UNFM_P_SIZE (sizeof(unp_t))
-#define UNFM_P_SHIFT 2
-
-/* in the in-core inode the key is stored in little-endian form */
-#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
-
-#define MAX_UL_INT 0xffffffff
-#define MAX_INT 0x7fffffff
-#define MAX_US_INT 0xffff
-
-/* reiserfs version 2 has a max offset of 60 bits; version 1 has 32-bit offsets */
-static inline loff_t max_reiserfs_offset(struct inode *inode)
-{
- if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
- return (loff_t) U32_MAX;
-
- return (loff_t) ((~(__u64) 0) >> 4);
-}
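/*
 * Editorial note: for KEY_FORMAT_3_5 keys this caps file offsets at
 * 0xffffffff (just under 4 GiB); for 3.6 keys it is
 * ((2^64 - 1) >> 4) = 2^60 - 1, matching the "60 bits" above.
 */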
-
-#define MAX_KEY_OBJECTID MAX_UL_INT
-
-#define MAX_B_NUM MAX_UL_INT
-#define MAX_FC_NUM MAX_US_INT
-
-/* the purpose is to detect overflow of an unsigned short */
-#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
-
-/*
- * The following defines are used in reiserfs_insert_item
- * and reiserfs_append_item
- */
-#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */
-#define REISERFS_USER_MEM 1 /* user memory mode */
-
-#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
-#define get_generation(s) atomic_read (&fs_generation(s))
-#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen)
-#define __fs_changed(gen,s) (gen != get_generation (s))
-#define fs_changed(gen,s) \
-({ \
- reiserfs_cond_resched(s); \
- __fs_changed(gen, s); \
-})
-
-/***************************************************************************
- * FIXATE NODES *
- ***************************************************************************/
-
-#define VI_TYPE_LEFT_MERGEABLE 1
-#define VI_TYPE_RIGHT_MERGEABLE 2
-
-/*
- * To make any change in the tree we always first find the node that
- * contains the item to be changed/deleted, or the place to insert a
- * new item. We call this node S. To do balancing we need to decide
- * what we will shift to the left/right neighbor, or to a new node
- * where the new item will go, etc. To make this analysis simpler we
- * build a virtual node. A virtual node is an array of items that will
- * replace the items of node S. (For instance, if we are going to
- * delete an item, the virtual node does not contain it.) The virtual
- * node keeps information about item sizes and types, mergeability of
- * the first and last items, and the sizes of all entries in a
- * directory item. We use this array of items when calculating what we
- * can shift to neighbors and how many nodes we need if we do no
- * shifting, if we shift to the left/right neighbor, or to both.
- */
-struct virtual_item {
- int vi_index; /* index in the array of item operations */
- unsigned short vi_type; /* left/right mergeability */
-
- /* length of item that it will have after balancing */
- unsigned short vi_item_len;
-
- struct item_head *vi_ih;
- const char *vi_item; /* body of item (old or new) */
- const void *vi_new_data; /* NULL except in paste mode */
- void *vi_uarea; /* item specific area */
-};
-
-struct virtual_node {
- /* this is a pointer to the free space in the buffer */
- char *vn_free_ptr;
-
- unsigned short vn_nr_item; /* number of items in virtual node */
-
- /*
- * size the node would have if it had unlimited
- * size and no balancing were performed
- */
- short vn_size;
-
- /* mode of balancing (paste, insert, delete, cut) */
- short vn_mode;
-
- short vn_affected_item_num;
- short vn_pos_in_item;
-
- /* item header of inserted item, 0 for other modes */
- struct item_head *vn_ins_ih;
- const void *vn_data;
-
- /* array of items (including a new one, excluding item to be deleted) */
- struct virtual_item *vn_vi;
-};
-
-/* used by directory items when creating virtual nodes */
-struct direntry_uarea {
- int flags;
- __u16 entry_count;
- __u16 entry_sizes[];
-} __attribute__ ((__packed__));
-
-/***************************************************************************
- * TREE BALANCE *
- ***************************************************************************/
-
-/*
- * This temporary structure is used in tree balance algorithms, and
- * constructed as we go to the extent that its various parts are
- * needed. It contains arrays of nodes that can potentially be
- * involved in the balancing of node S, and parameters that define how
- * each of the nodes must be balanced. Note that in these algorithms
- * for balancing the worst case is to need to balance the current node
- * S and the left and right neighbors and all of their parents plus
- * create a new node. We implement S1 balancing for the leaf nodes
- * and S0 balancing for the internal nodes (S1 and S0 are defined in
- * our papers.)
- */
-
-/* size of the array of buffers to free at end of do_balance */
-#define MAX_FREE_BLOCK 7
-
-/* maximum number of FEB blocknrs on a single level */
-#define MAX_AMOUNT_NEEDED 2
-
-/* someday somebody will prefix every field in this struct with tb_ */
-struct tree_balance {
- int tb_mode;
- int need_balance_dirty;
- struct super_block *tb_sb;
- struct reiserfs_transaction_handle *transaction_handle;
- struct treepath *tb_path;
-
- /* array of left neighbors of nodes in the path */
- struct buffer_head *L[MAX_HEIGHT];
-
- /* array of right neighbors of nodes in the path */
- struct buffer_head *R[MAX_HEIGHT];
-
- /* array of fathers of the left neighbors */
- struct buffer_head *FL[MAX_HEIGHT];
-
- /* array of fathers of the right neighbors */
- struct buffer_head *FR[MAX_HEIGHT];
- /* array of common parents of center node and its left neighbor */
- struct buffer_head *CFL[MAX_HEIGHT];
-
- /* array of common parents of center node and its right neighbor */
- struct buffer_head *CFR[MAX_HEIGHT];
-
- /*
- * array of empty buffers. Number of buffers in array equals
- * cur_blknum.
- */
- struct buffer_head *FEB[MAX_FEB_SIZE];
- struct buffer_head *used[MAX_FEB_SIZE];
- struct buffer_head *thrown[MAX_FEB_SIZE];
-
- /*
- * array of number of items which must be shifted to the left in
- * order to balance the current node; for leaves includes item that
- * will be partially shifted; for internal nodes, it is the number
- * of child pointers rather than items. It includes the new item
- * being created. The code sometimes subtracts one to get the
- * number of wholly shifted items for other purposes.
- */
- int lnum[MAX_HEIGHT];
-
- /* substitute right for left in comment above */
- int rnum[MAX_HEIGHT];
-
- /*
- * array indexed by height h mapping the key delimiting L[h] and
- * S[h] to its item number within the node CFL[h]
- */
- int lkey[MAX_HEIGHT];
-
- /* substitute r for l in comment above */
- int rkey[MAX_HEIGHT];
-
- /*
- * the number of bytes by which we are trying to add or remove from
- * S[h]. A negative value means removing.
- */
- int insert_size[MAX_HEIGHT];
-
- /*
- * number of nodes that will replace node S[h] after balancing
- * on the level h of the tree. If 0 then S is being deleted,
- * if 1 then S is remaining and no new nodes are being created,
- * if 2 or 3 then 1 or 2 new nodes are being created
- */
- int blknum[MAX_HEIGHT];
-
- /* fields that are used only for balancing leaves of the tree */
-
- /* number of empty blocks having been already allocated */
- int cur_blknum;
-
- /* number of items that fall into left most node when S[0] splits */
- int s0num;
-
- /*
- * number of bytes which can flow to the left neighbor from the
- * leftmost liquid item that cannot be shifted from S[0] entirely;
- * if -1 then nothing will be partially shifted
- */
- int lbytes;
-
- /*
- * number of bytes which will flow to the right neighbor from the
- * rightmost liquid item that cannot be shifted from S[0] entirely;
- * if -1 then nothing will be partially shifted
- */
- int rbytes;
-
- /*
- * index into the array of item headers in
- * S[0] of the affected item
- */
- int item_pos;
-
- /* new nodes allocated to hold what could not fit into S */
- struct buffer_head *S_new[2];
-
- /*
- * number of items that will be placed into nodes in S_new
- * when S[0] splits
- */
- int snum[2];
-
- /*
- * number of bytes which flow to nodes in S_new when S[0] splits
- * note: if S[0] splits into 3 nodes, then items do not need to be cut
- */
- int sbytes[2];
-
- int pos_in_item;
- int zeroes_num;
-
- /*
- * buffers which are to be freed after do_balance finishes
- * by unfix_nodes
- */
- struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
-
- /*
- * kmalloced memory. Used to create virtual node and keep
- * map of dirtied bitmap blocks
- */
- char *vn_buf;
-
- int vn_buf_size; /* size of the vn_buf */
-
- /* VN starts after bitmap of bitmap blocks */
- struct virtual_node *tb_vn;
-
- /*
- * saved value of the `reiserfs_generation' counter; see the
- * FILESYSTEM_CHANGED_TB() macro above
- */
- int fs_gen;
-
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /*
- * key pointer, to pass to block allocator or
- * another low-level subsystem
- */
- struct in_core_key key;
-#endif
-};
-
-/* These are modes of balancing */
-
-/* When inserting an item. */
-#define M_INSERT 'i'
-/*
- * When inserting into (directories only) or appending onto an already
- * existent item.
- */
-#define M_PASTE 'p'
-/* When deleting an item. */
-#define M_DELETE 'd'
-/* When truncating an item or removing an entry from a (directory) item. */
-#define M_CUT 'c'
-
-/* used when balancing on the leaf level is skipped (in reiserfsck) */
-#define M_INTERNAL 'n'
-
-/*
- * When further balancing is not needed, do_balance does not need
- * to be called.
- */
-#define M_SKIP_BALANCING 's'
-#define M_CONVERT 'v'
-
-/* modes of leaf_move_items */
-#define LEAF_FROM_S_TO_L 0
-#define LEAF_FROM_S_TO_R 1
-#define LEAF_FROM_R_TO_L 2
-#define LEAF_FROM_L_TO_R 3
-#define LEAF_FROM_S_TO_SNEW 4
-
-#define FIRST_TO_LAST 0
-#define LAST_TO_FIRST 1
-
-/*
- * used in do_balance for passing parent-of-node information that
- * has been taken from the tb struct
- */
-struct buffer_info {
- struct tree_balance *tb;
- struct buffer_head *bi_bh;
- struct buffer_head *bi_parent;
- int bi_position;
-};
-
-static inline struct super_block *sb_from_tb(struct tree_balance *tb)
-{
- return tb ? tb->tb_sb : NULL;
-}
-
-static inline struct super_block *sb_from_bi(struct buffer_info *bi)
-{
- return bi ? sb_from_tb(bi->tb) : NULL;
-}
-
-/*
- * there are 4 types of items: stat data, directory item, indirect, direct.
- * +-------------------+------------+--------------+------------+
- * | | k_offset | k_uniqueness | mergeable? |
- * +-------------------+------------+--------------+------------+
- * | stat data | 0 | 0 | no |
- * +-------------------+------------+--------------+------------+
- * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no |
- * | non 1st directory | hash value | UNIQUENESS | yes |
- * | item | | | |
- * +-------------------+------------+--------------+------------+
- * | indirect item | offset + 1 |TYPE_INDIRECT | [1] |
- * +-------------------+------------+--------------+------------+
- * | direct item | offset + 1 |TYPE_DIRECT | [2] |
- * +-------------------+------------+--------------+------------+
- *
- * [1] if this is not the first indirect item of the object
- * [2] if this is not the first direct item of the object
- */
-
-struct item_operations {
- int (*bytes_number) (struct item_head * ih, int block_size);
- void (*decrement_key) (struct cpu_key *);
- int (*is_left_mergeable) (struct reiserfs_key * ih,
- unsigned long bsize);
- void (*print_item) (struct item_head *, char *item);
- void (*check_item) (struct item_head *, char *item);
-
- int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
- int is_affected, int insert_size);
- int (*check_left) (struct virtual_item * vi, int free,
- int start_skip, int end_skip);
- int (*check_right) (struct virtual_item * vi, int free);
- int (*part_size) (struct virtual_item * vi, int from, int to);
- int (*unit_num) (struct virtual_item * vi);
- void (*print_vi) (struct virtual_item * vi);
-};
-
-extern struct item_operations *item_ops[TYPE_ANY + 1];
-
-#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
-#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
-#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item)
-#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item)
-#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size)
-#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip)
-#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free)
-#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to)
-#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi)
-#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi)
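/*
 * Editor's sketch: the op_* wrappers dispatch through item_ops on the
 * item type, so callers stay type-agnostic. For example, the logical
 * size of any item; example_item_bytes() is a hypothetical helper.
 */
static inline int example_item_bytes(struct item_head *ih, int blocksize)
{
	/* indirect items count pointed-to bytes, direct items their body */
	return op_bytes_number(ih, blocksize);
}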
-
-#define COMP_SHORT_KEYS comp_short_keys
-
-/* number of blocks pointed to by the indirect item */
-#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
-
-/*
- * the used space within the unformatted node corresponding
- * to pos within the item pointed to by ih
- */
-#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
-
-/*
- * number of bytes contained by the direct item or the
- * unformatted nodes the indirect item points to
- */
-
-/* following defines use reiserfs buffer header and item header */
-
-/* get stat-data */
-#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
-
-/* this is 3976 for size==4096 */
-#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
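/*
 * Editorial breakdown of the 3976 figure, assuming the usual on-disk
 * sizes BLKH_SIZE = 24, IH_SIZE = 24, SD_SIZE = 44, UNFM_P_SIZE = 4:
 * 4096 - 24 - 2*24 - 44 - 4 = 3976. A leaf holding a maximal direct
 * item must still fit the block head, the stat data item plus its
 * header, the direct item's own header, and one unformatted-node
 * pointer.
 */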
-
-/*
- * indirect items consist of entries which contain blocknrs, pos
- * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
- * blocknr contained by the entry pos points to
- */
-#define B_I_POS_UNFM_POINTER(bh, ih, pos) \
- le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
-#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \
- (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
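/*
 * Editor's sketch (not from the deleted header): enumerating the block
 * numbers an indirect item points to, using I_UNFM_NUM() and the
 * accessor above.
 */
static void example_dump_unfm(const struct buffer_head *bh,
			      const struct item_head *ih)
{
	int pos;

	for (pos = 0; pos < I_UNFM_NUM(ih); pos++)
		printk(KERN_DEBUG "unfm pointer %d -> block %u\n",
		       pos, B_I_POS_UNFM_POINTER(bh, ih, pos));
}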
-
-struct reiserfs_iget_args {
- __u32 objectid;
- __u32 dirid;
-};
-
-/***************************************************************************
- * FUNCTION DECLARATIONS *
- ***************************************************************************/
-
-#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
-
-#define journal_trans_half(blocksize) \
- ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
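/*
 * Worked example (editorial): with the default 4 KiB block size, and
 * sizeof(struct reiserfs_journal_desc) being 12 bytes for the three
 * __le32 fields declared below, journal_trans_half(4096) is
 * (4096 - 12 - 12) / 4 = 1018 block numbers per desc or commit block.
 */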
-
-/* journal.c see journal.c for all the comments here */
-
-/* first block written in a commit. */
-struct reiserfs_journal_desc {
- __le32 j_trans_id; /* id of commit */
-
- /* length of commit. len +1 is the commit block */
- __le32 j_len;
-
- __le32 j_mount_id; /* mount id of this trans */
- __le32 j_realblock[]; /* real locations for each block */
-};
-
-#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
-#define get_desc_trans_len(d) le32_to_cpu((d)->j_len)
-#define get_desc_mount_id(d) le32_to_cpu((d)->j_mount_id)
-
-#define set_desc_trans_id(d,val) do { (d)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_desc_trans_len(d,val) do { (d)->j_len = cpu_to_le32 (val); } while (0)
-#define set_desc_mount_id(d,val) do { (d)->j_mount_id = cpu_to_le32 (val); } while (0)
-
-/* last block written in a commit */
-struct reiserfs_journal_commit {
- __le32 j_trans_id; /* must match j_trans_id from the desc block */
- __le32 j_len; /* ditto */
- __le32 j_realblock[]; /* real locations for each block */
-};
-
-#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
-#define get_commit_trans_len(c) le32_to_cpu((c)->j_len)
-#define get_commit_mount_id(c) le32_to_cpu((c)->j_mount_id)
-
-#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
-#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
-
-/*
- * this header block gets written whenever a transaction is considered
- * fully flushed, and is more recent than the last fully flushed transaction.
- * fully flushed means all the log blocks and all the real blocks are on
- * disk, and this transaction does not need to be replayed.
- */
-struct reiserfs_journal_header {
- /* id of last fully flushed transaction */
- __le32 j_last_flush_trans_id;
-
- /* offset in the log of where to start replay after a crash */
- __le32 j_first_unflushed_offset;
-
- __le32 j_mount_id;
- /* 12 */ struct journal_params jh_journal;
-};
-
-/* biggest tunable defines are right here */
-#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
-
-/* biggest possible single transaction, don't change for now (8/3/99) */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024
-#define JOURNAL_TRANS_MIN_DEFAULT 256
-
-/*
- * max blocks to batch into one transaction,
- * don't make this any bigger than 900
- */
-#define JOURNAL_MAX_BATCH_DEFAULT 900
-#define JOURNAL_MIN_RATIO 2
-#define JOURNAL_MAX_COMMIT_AGE 30
-#define JOURNAL_MAX_TRANS_AGE 30
-#define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
-#define JOURNAL_BLOCKS_PER_OBJECT(sb) (JOURNAL_PER_BALANCE_CNT * 3 + \
- 2 * (REISERFS_QUOTA_INIT_BLOCKS(sb) + \
- REISERFS_QUOTA_TRANS_BLOCKS(sb)))
-
-#ifdef CONFIG_QUOTA
-#define REISERFS_QUOTA_OPTS ((1 << REISERFS_USRQUOTA) | (1 << REISERFS_GRPQUOTA))
-/* We need to update data and inode (atime) */
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? 2 : 0)
-/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
-#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
-/* same as with INIT */
-#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & REISERFS_QUOTA_OPTS ? \
-(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
-#else
-#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
-#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
-#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
-#endif
-
-/*
- * both of these can be as low as 1, or as high as you want. The min is the
- * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
- * as needed, and released when transactions are committed. On release, if
- * the current number of nodes is > max, the node is freed, otherwise,
- * it is put on a free list for faster use later.
- */
-#define REISERFS_MIN_BITMAP_NODES 10
-#define REISERFS_MAX_BITMAP_NODES 100
-
-/* these are based on journal hash size of 8192 */
-#define JBH_HASH_SHIFT 13
-#define JBH_HASH_MASK 8191
-
-#define _jhashfn(sb,block) \
- (((unsigned long)sb>>L1_CACHE_SHIFT) ^ \
- (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
-#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-
-/* We need these to make journal.c code more readable */
-#define journal_find_get_block(s, block) __find_get_block(\
- file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
- block, s->s_blocksize)
-#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
- block, s->s_blocksize)
-
-enum reiserfs_bh_state_bits {
- BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
- BH_JDirty_wait,
- /*
- * disk block was taken off the free list before being in a
- * finished transaction or written to disk. Can be reused immediately.
- */
- BH_JNew,
- BH_JPrepared,
- BH_JRestore_dirty,
- BH_JTest, /* debugging only will go away */
-};
-
-BUFFER_FNS(JDirty, journaled);
-TAS_BUFFER_FNS(JDirty, journaled);
-BUFFER_FNS(JDirty_wait, journal_dirty);
-TAS_BUFFER_FNS(JDirty_wait, journal_dirty);
-BUFFER_FNS(JNew, journal_new);
-TAS_BUFFER_FNS(JNew, journal_new);
-BUFFER_FNS(JPrepared, journal_prepared);
-TAS_BUFFER_FNS(JPrepared, journal_prepared);
-BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
-BUFFER_FNS(JTest, journal_test);
-TAS_BUFFER_FNS(JTest, journal_test);
-
-/* transaction handle which is passed around for all journal calls */
-struct reiserfs_transaction_handle {
- /*
- * super for this FS when journal_begin was called; saves calls to
- * reiserfs_get_super. Also used by nested transactions to make
- * sure they are nesting on the right FS. _Must_ be first
- * in the handle.
- */
- struct super_block *t_super;
-
- int t_refcount;
- int t_blocks_logged; /* number of blocks this writer has logged */
- int t_blocks_allocated; /* number of blocks this writer allocated */
-
- /* sanity check, equals the current trans id */
- unsigned int t_trans_id;
-
- void *t_handle_save; /* save existing current->journal_info */
-
- /*
- * if new block allocation occurs, that block
- * should be displaced from others
- */
- unsigned displace_new_blocks:1;
-
- struct list_head t_list;
-};
-
-/*
- * used to keep track of ordered and tail writes, attached to the buffer
- * head through b_journal_head.
- */
-struct reiserfs_jh {
- struct reiserfs_journal_list *jl;
- struct buffer_head *bh;
- struct list_head list;
-};
-
-void reiserfs_free_jh(struct buffer_head *bh);
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
-int journal_mark_dirty(struct reiserfs_transaction_handle *,
- struct buffer_head *bh);
-
-static inline int reiserfs_file_data_log(struct inode *inode)
-{
- if (reiserfs_data_log(inode->i_sb) ||
- (REISERFS_I(inode)->i_flags & i_data_log))
- return 1;
- return 0;
-}
-
-static inline int reiserfs_transaction_running(struct super_block *s)
-{
- struct reiserfs_transaction_handle *th = current->journal_info;
- if (th && th->t_super == s)
- return 1;
- if (th && th->t_super == NULL)
- BUG();
- return 0;
-}
-
-static inline int reiserfs_transaction_free_space(struct reiserfs_transaction_handle *th)
-{
- return th->t_blocks_allocated - th->t_blocks_logged;
-}
-
-struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
- super_block
- *,
- int count);
-int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
-void reiserfs_vfs_truncate_file(struct inode *inode);
-int reiserfs_commit_page(struct inode *inode, struct page *page,
- unsigned from, unsigned to);
-void reiserfs_flush_old_commits(struct super_block *);
-int reiserfs_commit_for_inode(struct inode *);
-int reiserfs_inode_needs_commit(struct inode *);
-void reiserfs_update_inode_transaction(struct inode *);
-void reiserfs_wait_on_write_block(struct super_block *s);
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
-void reiserfs_allow_writes(struct super_block *s);
-void reiserfs_check_lock_depth(struct super_block *s, char *caller);
-int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
- int wait);
-void reiserfs_restore_prepared_buffer(struct super_block *,
- struct buffer_head *bh);
-int journal_init(struct super_block *, const char *j_dev_name, int old_format,
- unsigned int);
-int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
-int journal_release_error(struct reiserfs_transaction_handle *,
- struct super_block *);
-int journal_end(struct reiserfs_transaction_handle *);
-int journal_end_sync(struct reiserfs_transaction_handle *);
-int journal_mark_freed(struct reiserfs_transaction_handle *,
- struct super_block *, b_blocknr_t blocknr);
-int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
-int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
- int bit_nr, int searchall, b_blocknr_t *next);
-int journal_begin(struct reiserfs_transaction_handle *,
- struct super_block *sb, unsigned long);
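/*
 * Editor's sketch (illustrative only): the usual shape of a journaled
 * metadata update using the calls declared above. Error handling is
 * reduced to the minimum; JOURNAL_PER_BALANCE_CNT is one plausible
 * block reservation, and example_journaled_update() is hypothetical.
 */
static int example_journaled_update(struct super_block *sb,
				    struct buffer_head *bh)
{
	struct reiserfs_transaction_handle th;
	int err;

	err = journal_begin(&th, sb, JOURNAL_PER_BALANCE_CNT);
	if (err)
		return err;

	reiserfs_prepare_for_journal(sb, bh, 1);
	/* ... modify bh->b_data under the prepared state ... */
	journal_mark_dirty(&th, bh);

	return journal_end(&th);
}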
-int journal_join_abort(struct reiserfs_transaction_handle *,
- struct super_block *sb);
-void reiserfs_abort_journal(struct super_block *sb, int errno);
-void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
-int reiserfs_allocate_list_bitmaps(struct super_block *s,
- struct reiserfs_list_bitmap *, unsigned int);
-
-void reiserfs_schedule_old_flush(struct super_block *s);
-void reiserfs_cancel_old_flush(struct super_block *s);
-void add_save_link(struct reiserfs_transaction_handle *th,
- struct inode *inode, int truncate);
-int remove_save_link(struct inode *inode, int truncate);
-
-/* objectid.c */
-__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
-void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
- __u32 objectid_to_release);
-int reiserfs_convert_objectid_map_v1(struct super_block *);
-
-/* stree.c */
-int B_IS_IN_TREE(const struct buffer_head *);
-extern void copy_item_head(struct item_head *to,
- const struct item_head *from);
-
-/* first key is in le form, second - cpu */
-extern int comp_short_keys(const struct reiserfs_key *le_key,
- const struct cpu_key *cpu_key);
-extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
-
-/* both are in le form */
-extern int comp_le_keys(const struct reiserfs_key *,
- const struct reiserfs_key *);
-extern int comp_short_le_keys(const struct reiserfs_key *,
- const struct reiserfs_key *);
-
-/* get key version from the on-disk key - kludge */
-static inline int le_key_version(const struct reiserfs_key *key)
-{
- int type;
-
- type = offset_v2_k_type(&(key->u.k_offset_v2));
- if (type != TYPE_DIRECT && type != TYPE_INDIRECT
- && type != TYPE_DIRENTRY)
- return KEY_FORMAT_3_5;
-
- return KEY_FORMAT_3_6;
-}
-
-static inline void copy_key(struct reiserfs_key *to,
- const struct reiserfs_key *from)
-{
- memcpy(to, from, KEY_SIZE);
-}
-
-int comp_items(const struct item_head *stored_ih, const struct treepath *path);
-const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
- const struct super_block *sb);
-int search_by_key(struct super_block *, const struct cpu_key *,
- struct treepath *, int);
-#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
-int search_for_position_by_key(struct super_block *sb,
- const struct cpu_key *cpu_key,
- struct treepath *search_path);
-extern void decrement_bcount(struct buffer_head *bh);
-void decrement_counters_in_path(struct treepath *search_path);
-void pathrelse(struct treepath *search_path);
-int reiserfs_check_path(struct treepath *p);
-void pathrelse_and_restore(struct super_block *s, struct treepath *search_path);
-
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
- struct treepath *path,
- const struct cpu_key *key,
- struct item_head *ih,
- struct inode *inode, const char *body);
-
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
- struct treepath *path,
- const struct cpu_key *key,
- struct inode *inode,
- const char *body, int paste_size);
-
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
- struct treepath *path,
- struct cpu_key *key,
- struct inode *inode,
- struct page *page, loff_t new_file_size);
-
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
- struct treepath *path,
- const struct cpu_key *key,
- struct inode *inode, struct buffer_head *un_bh);
-
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct reiserfs_key *key);
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
- struct inode *inode);
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct page *,
- int update_timestamps);
-
-#define i_block_size(inode) ((inode)->i_sb->s_blocksize)
-#define file_size(inode) ((inode)->i_size)
-#define tail_size(inode) (file_size (inode) & (i_block_size (inode) - 1))
-
-#define tail_has_to_be_packed(inode) \
- (have_large_tails((inode)->i_sb) ? \
-  !STORE_TAIL_IN_UNFM_S1(file_size(inode), tail_size(inode), \
-			 (inode)->i_sb->s_blocksize) : \
-  have_small_tails((inode)->i_sb) ? \
-  !STORE_TAIL_IN_UNFM_S2(file_size(inode), tail_size(inode), \
-			 (inode)->i_sb->s_blocksize) : 0)
-
-void padd_item(char *item, int total_length, int length);
-
-/* inode.c */
-/* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
-
-void reiserfs_read_locked_inode(struct inode *inode,
- struct reiserfs_iget_args *args);
-int reiserfs_find_actor(struct inode *inode, void *p);
-int reiserfs_init_locked_inode(struct inode *inode, void *p);
-void reiserfs_evict_inode(struct inode *inode);
-int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int reiserfs_get_block(struct inode *inode, sector_t block,
- struct buffer_head *bh_result, int create);
-struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type);
-struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type);
-int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
- struct inode *parent);
-
-int reiserfs_truncate_file(struct inode *, int update_timestamps);
-void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
- int type, int key_length);
-void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
- int version,
- loff_t offset, int type, int length, int entry_count);
-struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
-
-struct reiserfs_security_handle;
-int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
- struct inode *dir, umode_t mode,
- const char *symname, loff_t i_size,
- struct dentry *dentry, struct inode *inode,
- struct reiserfs_security_handle *security);
-
-void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
- struct inode *inode, loff_t size);
-
-static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
- struct inode *inode)
-{
- reiserfs_update_sd_size(th, inode, inode->i_size);
-}
-
-void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr);
-
-int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
-
-/* namei.c */
-void reiserfs_init_priv_inode(struct inode *inode);
-void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
-int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
- struct treepath *path, struct reiserfs_dir_entry *de);
-struct dentry *reiserfs_get_parent(struct dentry *);
-
-#ifdef CONFIG_REISERFS_PROC_INFO
-int reiserfs_proc_info_init(struct super_block *sb);
-int reiserfs_proc_info_done(struct super_block *sb);
-int reiserfs_proc_info_global_init(void);
-int reiserfs_proc_info_global_done(void);
-
-#define PROC_EXP( e ) e
-
-#define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
-#define PROC_INFO_MAX( sb, field, value ) \
- __PINFO( sb ).field = \
- max( REISERFS_SB( sb ) -> s_proc_info_data.field, value )
-#define PROC_INFO_INC( sb, field ) ( ++ ( __PINFO( sb ).field ) )
-#define PROC_INFO_ADD( sb, field, val ) ( __PINFO( sb ).field += ( val ) )
-#define PROC_INFO_BH_STAT( sb, bh, level ) \
- PROC_INFO_INC( sb, sbk_read_at[ ( level ) ] ); \
- PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) ); \
- PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
-#else
-static inline int reiserfs_proc_info_init(struct super_block *sb)
-{
- return 0;
-}
-
-static inline int reiserfs_proc_info_done(struct super_block *sb)
-{
- return 0;
-}
-
-static inline int reiserfs_proc_info_global_init(void)
-{
- return 0;
-}
-
-static inline int reiserfs_proc_info_global_done(void)
-{
- return 0;
-}
-
-#define PROC_EXP( e )
-#define VOID_V ( ( void ) 0 )
-#define PROC_INFO_MAX( sb, field, value ) VOID_V
-#define PROC_INFO_INC( sb, field ) VOID_V
-#define PROC_INFO_ADD( sb, field, val ) VOID_V
-#define PROC_INFO_BH_STAT(sb, bh, n_node_level) VOID_V
-#endif
-
-/* dir.c */
-extern const struct inode_operations reiserfs_dir_inode_operations;
-extern const struct inode_operations reiserfs_symlink_inode_operations;
-extern const struct inode_operations reiserfs_special_inode_operations;
-extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_inode(struct inode *, struct dir_context *);
-
-/* tail_conversion.c */
-int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
- struct treepath *, struct buffer_head *, loff_t);
-int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
- struct page *, struct treepath *, const struct cpu_key *,
- loff_t, char *);
-void reiserfs_unmap_buffer(struct buffer_head *);
-
-/* file.c */
-extern const struct inode_operations reiserfs_file_inode_operations;
-extern const struct inode_operations reiserfs_priv_file_inode_operations;
-extern const struct file_operations reiserfs_file_operations;
-extern const struct address_space_operations reiserfs_address_space_operations;
-
-/* fix_nodes.c */
-
-int fix_nodes(int n_op_mode, struct tree_balance *tb,
- struct item_head *ins_ih, const void *);
-void unfix_nodes(struct tree_balance *);
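/*
 * Editor's sketch (not from the deleted file): how fix_nodes() and
 * do_balance() are typically sequenced for an insert, assuming the
 * CARRY_ON success code fix_nodes() returns elsewhere in reiserfs. A
 * real caller must also handle REPEAT_SEARCH and NO_DISK_SPACE.
 */
static int example_insert_flow(struct tree_balance *tb,
			       struct item_head *ih, const char *body)
{
	int ret = fix_nodes(M_INSERT, tb, ih, body);

	if (ret != CARRY_ON) {
		unfix_nodes(tb);	/* give back what fix_nodes grabbed */
		return ret;
	}
	/* do_balance() releases the tree_balance resources itself */
	do_balance(tb, ih, body, M_INSERT);
	return 0;
}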
-
-/* prints.c */
-void __reiserfs_panic(struct super_block *s, const char *id,
- const char *function, const char *fmt, ...)
- __attribute__ ((noreturn));
-#define reiserfs_panic(s, id, fmt, args...) \
- __reiserfs_panic(s, id, __func__, fmt, ##args)
-void __reiserfs_error(struct super_block *s, const char *id,
- const char *function, const char *fmt, ...);
-#define reiserfs_error(s, id, fmt, args...) \
- __reiserfs_error(s, id, __func__, fmt, ##args)
-void reiserfs_info(struct super_block *s, const char *fmt, ...);
-void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
-void print_indirect_item(struct buffer_head *bh, int item_num);
-void store_print_tb(struct tree_balance *tb);
-void print_cur_tb(char *mes);
-void print_de(struct reiserfs_dir_entry *de);
-void print_bi(struct buffer_info *bi, char *mes);
-#define PRINT_LEAF_ITEMS 1 /* print all items */
-#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */
-#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */
-void print_block(struct buffer_head *bh, ...);
-void print_bmap(struct super_block *s, int silent);
-void print_bmap_block(int i, char *data, int size, int silent);
-/*void print_super_block (struct super_block * s, char * mes);*/
-void print_objectid_map(struct super_block *s);
-void print_block_head(struct buffer_head *bh, char *mes);
-void check_leaf(struct buffer_head *bh);
-void check_internal(struct buffer_head *bh);
-void print_statistics(struct super_block *s);
-char *reiserfs_hashname(int code);
-
-/* lbalance.c */
-int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
- int mov_bytes, struct buffer_head *Snew);
-int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
-int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
-void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
- int del_num, int del_bytes);
-void leaf_insert_into_buf(struct buffer_info *bi, int before,
- struct item_head * const inserted_item_ih,
- const char * const inserted_item_body,
- int zeros_number);
-void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
- int pos_in_item, int paste_size,
- const char * const body, int zeros_number);
-void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
- int pos_in_item, int cut_size);
-void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
- int new_entry_count, struct reiserfs_de_head *new_dehs,
- const char *records, int paste_size);
-/* ibalance.c */
-int balance_internal(struct tree_balance *, int, int, struct item_head *,
- struct buffer_head **);
-
-/* do_balance.c */
-void do_balance_mark_leaf_dirty(struct tree_balance *tb,
- struct buffer_head *bh, int flag);
-#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
-#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-
-void do_balance(struct tree_balance *tb, struct item_head *ih,
- const char *body, int flag);
-void reiserfs_invalidate_buffer(struct tree_balance *tb,
- struct buffer_head *bh);
-
-int get_left_neighbor_position(struct tree_balance *tb, int h);
-int get_right_neighbor_position(struct tree_balance *tb, int h);
-void replace_key(struct tree_balance *tb, struct buffer_head *, int,
- struct buffer_head *, int);
-void make_empty_node(struct buffer_info *);
-struct buffer_head *get_FEB(struct tree_balance *);
-
-/* bitmap.c */
-
-/*
- * structure contains hints for block allocator, and it is a container for
- * arguments, such as node, search path, transaction_handle, etc.
- */
-struct __reiserfs_blocknr_hint {
- /* inode passed to allocator, if we allocate unf. nodes */
- struct inode *inode;
-
- sector_t block; /* file offset, in blocks */
- struct in_core_key key;
-
- /*
- * search path, used by the allocator to determine search_start
- * in various ways
- */
- struct treepath *path;
-
- /*
- * transaction handle is needed to log super blocks
- * and bitmap blocks changes
- */
- struct reiserfs_transaction_handle *th;
-
- b_blocknr_t beg, end;
-
- /*
- * a field used to transfer search start value (block number)
- * between different block allocator procedures
- * (determine_search_start() and others)
- */
- b_blocknr_t search_start;
-
- /*
- * set in the determine_prealloc_size() function,
- * used by the underlying functions that do the actual allocation
- */
- int prealloc_size;
-
- /*
- * the allocator uses different policies for getting disk
- * space for formatted/unformatted blocks with/without preallocation
- */
- unsigned formatted_node:1;
- unsigned preallocate:1;
-};
-
-typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
-
-int reiserfs_parse_alloc_options(struct super_block *, char *);
-void reiserfs_init_alloc_options(struct super_block *s);
-
-/*
- * given a directory, this will tell you what packing locality
- * to use for a new object underneath it. The locality is returned
- * in disk byte order (le).
- */
-__le32 reiserfs_choose_packing(struct inode *dir);
-
-void show_alloc_options(struct seq_file *seq, struct super_block *s);
-int reiserfs_init_bitmap_cache(struct super_block *sb);
-void reiserfs_free_bitmap_cache(struct super_block *sb);
-void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
-struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap);
-int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
- b_blocknr_t, int for_unformatted);
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
- int);
-static inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
- b_blocknr_t * new_blocknrs,
- int amount_needed)
-{
- reiserfs_blocknr_hint_t hint = {
- .th = tb->transaction_handle,
- .path = tb->tb_path,
- .inode = NULL,
- .key = tb->key,
- .block = 0,
- .formatted_node = 1
- };
- return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
- 0);
-}
-
-static inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
- *th, struct inode *inode,
- b_blocknr_t * new_blocknrs,
- struct treepath *path,
- sector_t block)
-{
- reiserfs_blocknr_hint_t hint = {
- .th = th,
- .path = path,
- .inode = inode,
- .block = block,
- .formatted_node = 0,
- .preallocate = 0
- };
- return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-#ifdef REISERFS_PREALLOCATE
-static inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
- *th, struct inode *inode,
- b_blocknr_t * new_blocknrs,
- struct treepath *path,
- sector_t block)
-{
- reiserfs_blocknr_hint_t hint = {
- .th = th,
- .path = path,
- .inode = inode,
- .block = block,
- .formatted_node = 0,
- .preallocate = 1
- };
- return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
-}
-
-void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
- struct inode *inode);
-void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
-#endif
-
-/* hashes.c */
-__u32 keyed_hash(const signed char *msg, int len);
-__u32 yura_hash(const signed char *msg, int len);
-__u32 r5_hash(const signed char *msg, int len);
-
-#define reiserfs_set_le_bit __set_bit_le
-#define reiserfs_test_and_set_le_bit __test_and_set_bit_le
-#define reiserfs_clear_le_bit __clear_bit_le
-#define reiserfs_test_and_clear_le_bit __test_and_clear_bit_le
-#define reiserfs_test_le_bit test_bit_le
-#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
-
-/*
- * sometimes reiserfs_truncate may need to allocate a few new blocks
- * to perform the indirect2direct conversion. People probably expect
- * truncate to work without problems on a filesystem without free
- * disk space, and may complain when it fails for lack of space.
- * This spare space allows us not to worry about it. 500 is probably
- * too much, but it should be absolutely safe.
- */
-#define SPARE_SPACE 500
-
-/* prototypes from ioctl.c */
-int reiserfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int reiserfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-long reiserfs_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg);
-int reiserfs_unpack(struct inode *inode);
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
deleted file mode 100644
index 7b498a0d060b..000000000000
--- a/fs/reiserfs/resize.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Written by Alexander Zarochentcev.
- *
- * The kernel part of the (on-line) reiserfs resizer.
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-
-int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
-{
- int err = 0;
- struct reiserfs_super_block *sb;
- struct reiserfs_bitmap_info *bitmap;
- struct reiserfs_bitmap_info *info;
- struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
- struct buffer_head *bh;
- struct reiserfs_transaction_handle th;
- unsigned int bmap_nr_new, bmap_nr;
- unsigned int block_r_new, block_r;
-
- struct reiserfs_list_bitmap *jb;
- struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
-
- unsigned long int block_count, free_blocks;
- int i;
- int copy_size;
- int depth;
-
- sb = SB_DISK_SUPER_BLOCK(s);
-
- if (SB_BLOCK_COUNT(s) >= block_count_new) {
- printk("can\'t shrink filesystem on-line\n");
- return -EINVAL;
- }
-
- /* check the device size */
- depth = reiserfs_write_unlock_nested(s);
- bh = sb_bread(s, block_count_new - 1);
- reiserfs_write_lock_nested(s, depth);
- if (!bh) {
- printk("reiserfs_resize: can\'t read last block\n");
- return -EINVAL;
- }
- bforget(bh);
-
- /*
- * old disk layout detection; those partitions can be mounted, but
- * cannot be resized
- */
- if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
- != REISERFS_DISK_OFFSET_IN_BYTES) {
- printk
- ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
- return -ENOTSUPP;
- }
-
- /* count used bits in last bitmap block */
- block_r = SB_BLOCK_COUNT(s) -
- (reiserfs_bmap_count(s) - 1) * s->s_blocksize * 8;
-
- /* count bitmap blocks in new fs */
- bmap_nr_new = block_count_new / (s->s_blocksize * 8);
- block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
- if (block_r_new)
- bmap_nr_new++;
- else
- block_r_new = s->s_blocksize * 8;
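	/*
	 * Editorial example: with 4 KiB blocks one bitmap block maps
	 * 4096 * 8 = 32768 blocks (128 MiB), so block_count_new =
	 * 100000 yields bmap_nr_new = 4 with block_r_new = 1696 bits
	 * used in the last bitmap block.
	 */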
-
- /* save old values */
- block_count = SB_BLOCK_COUNT(s);
- bmap_nr = reiserfs_bmap_count(s);
-
- /* resizing of reiserfs bitmaps (journal and real), if needed */
- if (bmap_nr_new > bmap_nr) {
- /* reallocate journal bitmaps */
- if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
- printk
- ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
- return -ENOMEM;
- }
- /*
- * the new journal bitmaps are zero filled; now we copy in
- * the bitmap node pointers from the old journal bitmap
- * structs, and then transfer the new data structures
- * into the journal struct.
- *
- * using the copy_size var below allows this code to work for
- * both shrinking and expanding the FS.
- */
- copy_size = min(bmap_nr_new, bmap_nr);
- copy_size =
- copy_size * sizeof(struct reiserfs_list_bitmap_node *);
- for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
- struct reiserfs_bitmap_node **node_tmp;
- jb = SB_JOURNAL(s)->j_list_bitmap + i;
- memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
-
- /*
- * just in case vfree schedules on us, copy the new
- * pointer into the journal struct before freeing the
- * old one
- */
- node_tmp = jb->bitmaps;
- jb->bitmaps = jbitmap[i].bitmaps;
- vfree(node_tmp);
- }
-
- /*
- * allocate additional bitmap blocks, reallocate
- * array of bitmap block pointers
- */
- bitmap =
- vzalloc(array_size(bmap_nr_new,
- sizeof(struct reiserfs_bitmap_info)));
- if (!bitmap) {
- /*
- * Journal bitmaps are still supersized, but the
- * memory isn't leaked, so I guess it's ok
- */
- printk("reiserfs_resize: unable to allocate memory.\n");
- return -ENOMEM;
- }
- for (i = 0; i < bmap_nr; i++)
- bitmap[i] = old_bitmap[i];
-
- /*
- * This doesn't go through the journal, but it doesn't have to.
- * The changes are still atomic: We're synced up when the
- * journal transaction begins, and the new bitmaps don't
- * matter if the transaction fails.
- */
- for (i = bmap_nr; i < bmap_nr_new; i++) {
- int depth;
- /*
- * don't use read_bitmap_block since it will cache
- * the uninitialized bitmap
- */
- depth = reiserfs_write_unlock_nested(s);
- bh = sb_bread(s, i * s->s_blocksize * 8);
- reiserfs_write_lock_nested(s, depth);
- if (!bh) {
- vfree(bitmap);
- return -EIO;
- }
- memset(bh->b_data, 0, sb_blocksize(sb));
- reiserfs_set_le_bit(0, bh->b_data);
- reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
-
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- depth = reiserfs_write_unlock_nested(s);
- sync_dirty_buffer(bh);
- reiserfs_write_lock_nested(s, depth);
- /* update bitmap_info stuff */
- bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
- brelse(bh);
- }
- /* free old bitmap blocks array */
- SB_AP_BITMAP(s) = bitmap;
- vfree(old_bitmap);
- }
-
- /*
- * begin the transaction; if there is an error it's fine. Yes, we have
- * incorrect bitmaps now, but none of it is ever going to touch the
- * disk anyway.
- */
- err = journal_begin(&th, s, 10);
- if (err)
- return err;
-
- /* Extend old last bitmap block - new blocks have been made available */
- info = SB_AP_BITMAP(s) + bmap_nr - 1;
- bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
- if (!bh) {
- int jerr = journal_end(&th);
- if (jerr)
- return jerr;
- return -EIO;
- }
-
- reiserfs_prepare_for_journal(s, bh, 1);
- for (i = block_r; i < s->s_blocksize * 8; i++)
- reiserfs_clear_le_bit(i, bh->b_data);
- info->free_count += s->s_blocksize * 8 - block_r;
-
- journal_mark_dirty(&th, bh);
- brelse(bh);
-
- /* Correct new last bitmap block - it may not be full */
- info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
- bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
- if (!bh) {
- int jerr = journal_end(&th);
- if (jerr)
- return jerr;
- return -EIO;
- }
-
- reiserfs_prepare_for_journal(s, bh, 1);
- for (i = block_r_new; i < s->s_blocksize * 8; i++)
- reiserfs_set_le_bit(i, bh->b_data);
- journal_mark_dirty(&th, bh);
- brelse(bh);
-
- info->free_count -= s->s_blocksize * 8 - block_r_new;
- /* update super */
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- free_blocks = SB_FREE_BLOCKS(s);
- PUT_SB_FREE_BLOCKS(s,
- free_blocks + (block_count_new - block_count -
- (bmap_nr_new - bmap_nr)));
- PUT_SB_BLOCK_COUNT(s, block_count_new);
- PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
-
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
-
- SB_JOURNAL(s)->j_must_wait = 1;
- return journal_end(&th);
-}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
deleted file mode 100644
index 5faf702f8d15..000000000000
--- a/fs/reiserfs/stree.c
+++ /dev/null
@@ -1,2280 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- */
-
-/*
- * Written by Anatoly P. Pinchuk pap@namesys.botik.ru
- * Programm System Institute
- * Pereslavl-Zalessky Russia
- */
-
-#include <linux/time.h>
-#include <linux/string.h>
-#include <linux/pagemap.h>
-#include <linux/bio.h>
-#include "reiserfs.h"
-#include <linux/buffer_head.h>
-#include <linux/quotaops.h>
-
-/* Does the buffer contain a disk block which is in the tree. */
-inline int B_IS_IN_TREE(const struct buffer_head *bh)
-{
-
- RFALSE(B_LEVEL(bh) > MAX_HEIGHT,
- "PAP-1010: block (%b) has too big level (%z)", bh, bh);
-
- return (B_LEVEL(bh) != FREE_LEVEL);
-}
-
-/* to get item head in le form */
-inline void copy_item_head(struct item_head *to,
- const struct item_head *from)
-{
- memcpy(to, from, IH_SIZE);
-}
-
-/*
- * le_key points to an on-disk structure stored in little-endian
- * form; cpu_key points to a cpu variable. For keys of items of the
- * same object this returns 0.
- * Returns: -1 if key1 < key2
- * 0 if key1 == key2
- * 1 if key1 > key2
- */
-inline int comp_short_keys(const struct reiserfs_key *le_key,
- const struct cpu_key *cpu_key)
-{
- __u32 n;
- n = le32_to_cpu(le_key->k_dir_id);
- if (n < cpu_key->on_disk_key.k_dir_id)
- return -1;
- if (n > cpu_key->on_disk_key.k_dir_id)
- return 1;
- n = le32_to_cpu(le_key->k_objectid);
- if (n < cpu_key->on_disk_key.k_objectid)
- return -1;
- if (n > cpu_key->on_disk_key.k_objectid)
- return 1;
- return 0;
-}
-
-/*
- * le_key points to an on-disk structure stored in little-endian
- * form; cpu_key points to a cpu variable.
- * Compare keys using all 4 key fields.
- * Returns: -1 if key1 < key2
- * 0 if key1 == key2
- * 1 if key1 > key2
- */
-static inline int comp_keys(const struct reiserfs_key *le_key,
- const struct cpu_key *cpu_key)
-{
- int retval;
-
- retval = comp_short_keys(le_key, cpu_key);
- if (retval)
- return retval;
- if (le_key_k_offset(le_key_version(le_key), le_key) <
- cpu_key_k_offset(cpu_key))
- return -1;
- if (le_key_k_offset(le_key_version(le_key), le_key) >
- cpu_key_k_offset(cpu_key))
- return 1;
-
- if (cpu_key->key_length == 3)
- return 0;
-
- /* this part is needed only when tail conversion is in progress */
- if (le_key_k_type(le_key_version(le_key), le_key) <
- cpu_key_k_type(cpu_key))
- return -1;
-
- if (le_key_k_type(le_key_version(le_key), le_key) >
- cpu_key_k_type(cpu_key))
- return 1;
-
- return 0;
-}
-
-inline int comp_short_le_keys(const struct reiserfs_key *key1,
- const struct reiserfs_key *key2)
-{
- __u32 *k1_u32, *k2_u32;
- int key_length = REISERFS_SHORT_KEY_LEN;
-
- k1_u32 = (__u32 *) key1;
- k2_u32 = (__u32 *) key2;
- for (; key_length--; ++k1_u32, ++k2_u32) {
- if (le32_to_cpu(*k1_u32) < le32_to_cpu(*k2_u32))
- return -1;
- if (le32_to_cpu(*k1_u32) > le32_to_cpu(*k2_u32))
- return 1;
- }
- return 0;
-}
-
-inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
-{
- int version;
- to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
- to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
-
- /* find out version of the key */
- version = le_key_version(from);
- to->version = version;
- to->on_disk_key.k_offset = le_key_k_offset(version, from);
- to->on_disk_key.k_type = le_key_k_type(version, from);
-}
-
-/*
- * this does not say which one is bigger; it only returns nonzero if
- * the keys are not equal, 0 otherwise
- */
-inline int comp_le_keys(const struct reiserfs_key *k1,
- const struct reiserfs_key *k2)
-{
- return memcmp(k1, k2, sizeof(struct reiserfs_key));
-}
-
-/**************************************************************************
- * Binary search toolkit function *
- * Search for an item in the array by the item key *
- * Returns: 1 if found, 0 if not found; *
- * *pos = number of the searched element if found, else the *
- * number of the first element that is larger than key. *
- **************************************************************************/
-/*
- * For those not familiar with binary search: lbound is the leftmost item
- * that it could be, rbound the rightmost item that it could be. We examine
- * the item halfway between lbound and rbound, and that tells us either
- * that we can increase lbound, or decrease rbound, or that we have found it,
- * or, once lbound > rbound, that there are no possible items, and we have not
- * found it. With each examination we cut the number of possible items it
- * could be by one more than half rounded down, or we find it.
- */
-static inline int bin_search(const void *key, /* Key to search for. */
- const void *base, /* First item in the array. */
- int num, /* Number of items in the array. */
- /*
- * Item size in the array searched. Lest the
- * reader be confused, note that this is crafted
- * as a general function, and when it is applied
- * specifically to the array of item headers in a
- * node, width is actually the item header size,
- * not the item size.
- */
- int width,
- int *pos /* Number of the searched for element. */
- )
-{
- int rbound, lbound, j;
-
- for (j = ((rbound = num - 1) + (lbound = 0)) / 2;
- lbound <= rbound; j = (rbound + lbound) / 2)
- switch (comp_keys
- ((struct reiserfs_key *)((char *)base + j * width),
- (struct cpu_key *)key)) {
- case -1:
- lbound = j + 1;
- continue;
- case 1:
- rbound = j - 1;
- continue;
- case 0:
- *pos = j;
- return ITEM_FOUND; /* Key found in the array. */
- }
-
- /*
- * bin_search did not find the given key; it returns the position
- * of the smallest key that is greater than the given one.
- */
- *pos = lbound;
- return ITEM_NOT_FOUND;
-}
-
-
-/* Minimal possible key. It is never in the tree. */
-const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
-
-/* Maximal possible key. It is never in the tree. */
-static const struct reiserfs_key MAX_KEY = {
- cpu_to_le32(0xffffffff),
- cpu_to_le32(0xffffffff),
- {{cpu_to_le32(0xffffffff),
- cpu_to_le32(0xffffffff)},}
-};
-
-/*
- * Get delimiting key of the buffer by looking for it in the buffers in the
- * path, starting from the bottom of the path, and going upwards. We must
- * check the path's validity at each step. If the key is not in the path,
- * there is no delimiting key in the tree (buffer is first or last buffer
- * in tree), and in this case we return a special key, either MIN_KEY or
- * MAX_KEY.
- */
-static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
- const struct super_block *sb)
-{
- int position, path_offset = chk_path->path_length;
- struct buffer_head *parent;
-
- RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
- "PAP-5010: invalid offset in the path");
-
- /* While not higher in path than first element. */
- while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
- RFALSE(!buffer_uptodate
- (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
- "PAP-5020: parent is not uptodate");
-
- /* Parent at the path is not in the tree now. */
- if (!B_IS_IN_TREE
- (parent =
- PATH_OFFSET_PBUFFER(chk_path, path_offset)))
- return &MAX_KEY;
- /* Check whether position in the parent is correct. */
- if ((position =
- PATH_OFFSET_POSITION(chk_path,
- path_offset)) >
- B_NR_ITEMS(parent))
- return &MAX_KEY;
- /* Check whether parent at the path really points to the child. */
- if (B_N_CHILD_NUM(parent, position) !=
- PATH_OFFSET_PBUFFER(chk_path,
- path_offset + 1)->b_blocknr)
- return &MAX_KEY;
- /*
- * Return delimiting key if position in the parent
- * is not equal to zero.
- */
- if (position)
- return internal_key(parent, position - 1);
- }
- /* Return MIN_KEY if we are in the root of the buffer tree. */
- if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
- b_blocknr == SB_ROOT_BLOCK(sb))
- return &MIN_KEY;
- return &MAX_KEY;
-}
-
-/* Get delimiting key of the buffer at the path and its right neighbor. */
-inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
- const struct super_block *sb)
-{
- int position, path_offset = chk_path->path_length;
- struct buffer_head *parent;
-
- RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET,
- "PAP-5030: invalid offset in the path");
-
- while (path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
-
- RFALSE(!buffer_uptodate
- (PATH_OFFSET_PBUFFER(chk_path, path_offset)),
- "PAP-5040: parent is not uptodate");
-
- /* Parent at the path is not in the tree now. */
- if (!B_IS_IN_TREE
- (parent =
- PATH_OFFSET_PBUFFER(chk_path, path_offset)))
- return &MIN_KEY;
- /* Check whether position in the parent is correct. */
- if ((position =
- PATH_OFFSET_POSITION(chk_path,
- path_offset)) >
- B_NR_ITEMS(parent))
- return &MIN_KEY;
- /*
- * Check whether parent at the path really points
- * to the child.
- */
- if (B_N_CHILD_NUM(parent, position) !=
- PATH_OFFSET_PBUFFER(chk_path,
- path_offset + 1)->b_blocknr)
- return &MIN_KEY;
-
- /*
- * Return delimiting key if position in the parent
- * is not the last one.
- */
- if (position != B_NR_ITEMS(parent))
- return internal_key(parent, position);
- }
-
- /* Return MAX_KEY if we are in the root of the buffer tree. */
- if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
- b_blocknr == SB_ROOT_BLOCK(sb))
- return &MAX_KEY;
- return &MIN_KEY;
-}
-
-/*
- * Check whether a key is contained in the tree rooted from a buffer at a path.
- * This works by looking at the left and right delimiting keys for the buffer
- * in the last path_element in the path. These delimiting keys are stored
- * at least one level above that buffer in the tree. If the buffer is the
- * first or last node in the tree order then one of the delimiting keys may
- * be absent, and in this case get_lkey and get_rkey return a special key
- * which is MIN_KEY or MAX_KEY.
- */
-static inline int key_in_buffer(
- /* Path which should be checked. */
- struct treepath *chk_path,
- /* Key which should be checked. */
- const struct cpu_key *key,
- struct super_block *sb
- )
-{
-
- RFALSE(!key || chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
- || chk_path->path_length > MAX_HEIGHT,
- "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
- key, chk_path->path_length);
- RFALSE(!PATH_PLAST_BUFFER(chk_path)->b_bdev,
- "PAP-5060: device must not be NODEV");
-
- if (comp_keys(get_lkey(chk_path, sb), key) == 1)
- /* left delimiting key is bigger than the key we look for */
- return 0;
- /* if ( comp_keys(key, get_rkey(chk_path, sb)) != -1 ) */
- if (comp_keys(get_rkey(chk_path, sb), key) != 1)
- /* key must be less than the right delimiting key */
- return 0;
- return 1;
-}
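-
-/*
- * Editor's note: the check above amounts to a half-open interval test,
- * lkey <= key < rkey, on the delimiting keys. A hedged userspace model
- * with plain ints (model_key_in_buffer is a hypothetical name):
- */
-static int model_key_in_buffer(int lkey, int key, int rkey)
-{
-    if (lkey > key)     /* left delimiting key is bigger than the key */
-        return 0;
-    if (rkey <= key)    /* key must be less than right delimiting key */
-        return 0;
-    return 1;
-}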
-
-int reiserfs_check_path(struct treepath *p)
-{
- RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
- "path not properly relsed");
- return 0;
-}
-
-/*
- * Drop the reference to each buffer in a path and restore the
- * dirty bits that were cleared when the buffers were prepared
- * for the log. This version should only be called from fix_nodes().
- */
-void pathrelse_and_restore(struct super_block *sb,
- struct treepath *search_path)
-{
- int path_offset = search_path->path_length;
-
- RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
- "clm-4000: invalid path offset");
-
- while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
- struct buffer_head *bh;
- bh = PATH_OFFSET_PBUFFER(search_path, path_offset--);
- reiserfs_restore_prepared_buffer(sb, bh);
- brelse(bh);
- }
- search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-/* Drop the reference to each buffer in a path */
-void pathrelse(struct treepath *search_path)
-{
- int path_offset = search_path->path_length;
-
- RFALSE(path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
- "PAP-5090: invalid path offset");
-
- while (path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
- brelse(PATH_OFFSET_PBUFFER(search_path, path_offset--));
-
- search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
-
-static int has_valid_deh_location(struct buffer_head *bh, struct item_head *ih)
-{
- struct reiserfs_de_head *deh;
- int i;
-
- deh = B_I_DEH(bh, ih);
- for (i = 0; i < ih_entry_count(ih); i++) {
- if (deh_location(&deh[i]) > ih_item_len(ih)) {
- reiserfs_warning(NULL, "reiserfs-5094",
- "directory entry location seems wrong %h",
- &deh[i]);
- return 0;
- }
- }
-
- return 1;
-}
-
-static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
-{
- struct block_head *blkh;
- struct item_head *ih;
- int used_space;
- int prev_location;
- int i;
- int nr;
-
- blkh = (struct block_head *)buf;
- if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
- reiserfs_warning(NULL, "reiserfs-5080",
- "this should be caught earlier");
- return 0;
- }
-
- nr = blkh_nr_item(blkh);
- if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
- /* item number is too big or too small */
- reiserfs_warning(NULL, "reiserfs-5081",
- "nr_item seems wrong: %z", bh);
- return 0;
- }
- ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
- used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
-
- /* free space does not match the calculated amount of used space */
- if (used_space != blocksize - blkh_free_space(blkh)) {
- reiserfs_warning(NULL, "reiserfs-5082",
- "free space seems wrong: %z", bh);
- return 0;
- }
- /*
- * FIXME: a full is_leaf check will hurt performance too much -
- * we may want to just return 1 here
- */
-
- /* check tables of item heads */
- ih = (struct item_head *)(buf + BLKH_SIZE);
- prev_location = blocksize;
- for (i = 0; i < nr; i++, ih++) {
- if (le_ih_k_type(ih) == TYPE_ANY) {
- reiserfs_warning(NULL, "reiserfs-5083",
- "wrong item type for item %h",
- ih);
- return 0;
- }
- if (ih_location(ih) >= blocksize
- || ih_location(ih) < IH_SIZE * nr) {
- reiserfs_warning(NULL, "reiserfs-5084",
- "item location seems wrong: %h",
- ih);
- return 0;
- }
- if (ih_item_len(ih) < 1
- || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
- reiserfs_warning(NULL, "reiserfs-5085",
- "item length seems wrong: %h",
- ih);
- return 0;
- }
- if (prev_location - ih_location(ih) != ih_item_len(ih)) {
- reiserfs_warning(NULL, "reiserfs-5086",
- "item location seems wrong "
- "(second one): %h", ih);
- return 0;
- }
- if (is_direntry_le_ih(ih)) {
- if (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE)) {
- reiserfs_warning(NULL, "reiserfs-5093",
- "item entry count seems wrong %h",
- ih);
- return 0;
- }
- return has_valid_deh_location(bh, ih);
- }
- prev_location = ih_location(ih);
- }
-
- /* one may imagine many more checks */
- return 1;
-}
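-
-/*
- * Editor's note: a hedged model of the leaf space accounting checked
- * above, assuming the usual leaf layout: the block head and the item
- * head array grow from the start of the block, while item bodies are
- * packed from the end, so the last item's location marks where used
- * body space begins. Names are illustrative only.
- */
-static int model_leaf_free_space(int blocksize, int blkh_size,
-                                 int ih_size, int nr, int last_ih_location)
-{
-    /* block head + item head array + packed item bodies */
-    int used = blkh_size + ih_size * nr + (blocksize - last_ih_location);
-
-    return blocksize - used;    /* must equal blkh_free_space(blkh) */
-}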
-
-/* returns 1 if buf looks like an internal node, 0 otherwise */
-static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
-{
- struct block_head *blkh;
- int nr;
- int used_space;
-
- blkh = (struct block_head *)buf;
- nr = blkh_level(blkh);
- if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
- /* this level is not possible for internal nodes */
- reiserfs_warning(NULL, "reiserfs-5087",
- "this should be caught earlier");
- return 0;
- }
-
- nr = blkh_nr_item(blkh);
- /* for an internal node that is not the root we might also check the min number of keys */
- if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
- reiserfs_warning(NULL, "reiserfs-5088",
- "number of key seems wrong: %z", bh);
- return 0;
- }
-
- used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
- if (used_space != blocksize - blkh_free_space(blkh)) {
- reiserfs_warning(NULL, "reiserfs-5089",
- "free space seems wrong: %z", bh);
- return 0;
- }
-
- /* one may imagine many more checks */
- return 1;
-}
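-
-/*
- * Editor's note: the used_space formula above reflects the internal
- * node layout: after the block head come nr keys and nr + 1 child
- * pointers (one more pointer than keys). A one-line hedged model with
- * hypothetical names:
- */
-static int model_internal_used_space(int blkh_size, int key_size,
-                                     int dc_size, int nr)
-{
-    return blkh_size + key_size * nr + dc_size * (nr + 1);
-}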
-
-/*
- * make sure that bh contains a formatted node of the reiserfs
- * tree at the expected 'level'
- */
-static int is_tree_node(struct buffer_head *bh, int level)
-{
- if (B_LEVEL(bh) != level) {
- reiserfs_warning(NULL, "reiserfs-5090", "node level %d does "
- "not match to the expected one %d",
- B_LEVEL(bh), level);
- return 0;
- }
- if (level == DISK_LEAF_NODE_LEVEL)
- return is_leaf(bh->b_data, bh->b_size, bh);
-
- return is_internal(bh->b_data, bh->b_size, bh);
-}
-
-#define SEARCH_BY_KEY_READA 16
-
-/*
- * The function is NOT SCHEDULE-SAFE!
- * It might release the write lock if we need to wait for a block
- * to be read. Note that in this case it won't reacquire the lock,
- * to avoid the high contention that too many lock requests would
- * cause, especially since the caller (search_by_key) will perform
- * other schedule-unsafe operations right after calling this function.
- *
- * @return depth of lock to be restored after read completes
- */
-static int search_by_key_reada(struct super_block *s,
- struct buffer_head **bh,
- b_blocknr_t *b, int num)
-{
- int i, j;
- int depth = -1;
-
- for (i = 0; i < num; i++) {
- bh[i] = sb_getblk(s, b[i]);
- }
- /*
- * We are going to read some blocks on which we
- * have a reference. It's safe, though we might be
- * reading blocks concurrently changed if we release
- * the lock. But it's still fine because we check later
- * if the tree changed
- */
- for (j = 0; j < i; j++) {
- /*
- * note, this needs attention if we are getting rid of the BKL
- * you have to make sure the prepared bit isn't set on this
- * buffer
- */
- if (!buffer_uptodate(bh[j])) {
- if (depth == -1)
- depth = reiserfs_write_unlock_nested(s);
- bh_readahead(bh[j], REQ_RAHEAD);
- }
- brelse(bh[j]);
- }
- return depth;
-}
-
-/*
- * This function fills up the path from the root to the leaf as it
- * descends the tree looking for the key. It uses reiserfs_bread to
- * try to find buffers in the cache given their block number. If it
- * does not find them in the cache it reads them from disk. For each
- * node that search_by_key finds via reiserfs_bread it then uses
- * bin_search to look through that node. In an internal node,
- * bin_search finds the position of the pointer to the next node to
- * descend into. In a leaf node, bin_search finds the position of the
- * item whose key is either equal to the given key, or is the maximal
- * key less than the given key. search_by_key returns a path whose
- * top must be checked for correctness, but whose bottom need not be.
- */
-/*
- * search_by_key - search for key (and item) in stree
- * @sb: superblock
- * @key: pointer to key to search for
- * @search_path: Allocated and initialized struct treepath; Returned filled
- * on success.
- * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
- * stop at leaf level.
- *
- * The function is NOT SCHEDULE-SAFE!
- */
-int search_by_key(struct super_block *sb, const struct cpu_key *key,
- struct treepath *search_path, int stop_level)
-{
- b_blocknr_t block_number;
- int expected_level;
- struct buffer_head *bh;
- struct path_element *last_element;
- int node_level, retval;
- int fs_gen;
- struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
- b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA];
- int reada_count = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
- int repeat_counter = 0;
-#endif
-
- PROC_INFO_INC(sb, search_by_key);
-
- /*
- * As we add each node to a path we increase its count. This means
- * that we must be careful to release all nodes in a path before we
- * either discard the path struct or re-use the path struct, as we
- * do here.
- */
-
- pathrelse(search_path);
-
- /*
- * With each iteration of this loop we search through the items in the
- * current node, and calculate the next current node(next path element)
- * for the next iteration of this loop..
- */
- block_number = SB_ROOT_BLOCK(sb);
- expected_level = -1;
- while (1) {
-
-#ifdef CONFIG_REISERFS_CHECK
- if (!(++repeat_counter % 50000))
- reiserfs_warning(sb, "PAP-5100",
- "%s: there were %d iterations of "
- "while loop looking for key %K",
- current->comm, repeat_counter,
- key);
-#endif
-
- /* prep path to have another element added to it. */
- last_element =
- PATH_OFFSET_PELEMENT(search_path,
- ++search_path->path_length);
- fs_gen = get_generation(sb);
-
- /*
- * Read the next tree node, and set the last element
- * in the path to have a pointer to it.
- */
- if ((bh = last_element->pe_buffer =
- sb_getblk(sb, block_number))) {
-
- /*
- * We'll need to drop the lock if we encounter any
- * buffers that need to be read. If all of them are
- * already up to date, we don't need to drop the lock.
- */
- int depth = -1;
-
- if (!buffer_uptodate(bh) && reada_count > 1)
- depth = search_by_key_reada(sb, reada_bh,
- reada_blocks, reada_count);
-
- if (!buffer_uptodate(bh) && depth == -1)
- depth = reiserfs_write_unlock_nested(sb);
-
- bh_read_nowait(bh, 0);
- wait_on_buffer(bh);
-
- if (depth != -1)
- reiserfs_write_lock_nested(sb, depth);
- if (!buffer_uptodate(bh))
- goto io_error;
- } else {
-io_error:
- search_path->path_length--;
- pathrelse(search_path);
- return IO_ERROR;
- }
- reada_count = 0;
- if (expected_level == -1)
- expected_level = SB_TREE_HEIGHT(sb);
- expected_level--;
-
- /*
- * It is possible that a schedule occurred. We must check
- * whether the key to search is still in the tree rooted
- * from the current buffer. If not then repeat search
- * from the root.
- */
- if (fs_changed(fs_gen, sb) &&
- (!B_IS_IN_TREE(bh) ||
- B_LEVEL(bh) != expected_level ||
- !key_in_buffer(search_path, key, sb))) {
- PROC_INFO_INC(sb, search_by_key_fs_changed);
- PROC_INFO_INC(sb, search_by_key_restarted);
- PROC_INFO_INC(sb,
- sbk_restarted[expected_level - 1]);
- pathrelse(search_path);
-
- /*
- * Get the root block number so that we can
- * repeat the search starting from the root.
- */
- block_number = SB_ROOT_BLOCK(sb);
- expected_level = -1;
-
- /* repeat search from the root */
- continue;
- }
-
- /*
- * only check that the key is in the buffer if the key is not
- * equal to MAX_KEY. The latter case is only possible during
- * "finish_unfinished()" processing at mount time.
- */
- RFALSE(comp_keys(&MAX_KEY, key) &&
- !key_in_buffer(search_path, key, sb),
- "PAP-5130: key is not in the buffer");
-#ifdef CONFIG_REISERFS_CHECK
- if (REISERFS_SB(sb)->cur_tb) {
- print_cur_tb("5140");
- reiserfs_panic(sb, "PAP-5140",
- "schedule occurred in do_balance!");
- }
-#endif
-
- /*
- * make sure, that the node contents look like a node of
- * certain level
- */
- if (!is_tree_node(bh, expected_level)) {
- reiserfs_error(sb, "vs-5150",
- "invalid format found in block %ld. "
- "Fsck?", bh->b_blocknr);
- pathrelse(search_path);
- return IO_ERROR;
- }
-
- /* ok, we have acquired next formatted node in the tree */
- node_level = B_LEVEL(bh);
-
- PROC_INFO_BH_STAT(sb, bh, node_level - 1);
-
- RFALSE(node_level < stop_level,
- "vs-5152: tree level (%d) is less than stop level (%d)",
- node_level, stop_level);
-
- retval = bin_search(key, item_head(bh, 0),
- B_NR_ITEMS(bh),
- (node_level ==
- DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
- KEY_SIZE,
- &last_element->pe_position);
- if (node_level == stop_level) {
- return retval;
- }
-
- /* we are not in the stop level */
- /*
- * item has been found, so we choose the pointer which
- * is to the right of the found one
- */
- if (retval == ITEM_FOUND)
- last_element->pe_position++;
-
- /*
- * if item was not found we choose the position which is to
- * the left of the found item. This requires no code,
- * bin_search did it already.
- */
-
- /*
- * So we have chosen a position in the current node which is
- * an internal node. Now we calculate child block number by
- * position in the node.
- */
- block_number =
- B_N_CHILD_NUM(bh, last_element->pe_position);
-
- /*
- * if we are going to read leaf nodes, try for read
- * ahead as well
- */
- if ((search_path->reada & PATH_READA) &&
- node_level == DISK_LEAF_NODE_LEVEL + 1) {
- int pos = last_element->pe_position;
- int limit = B_NR_ITEMS(bh);
- struct reiserfs_key *le_key;
-
- if (search_path->reada & PATH_READA_BACK)
- limit = 0;
- while (reada_count < SEARCH_BY_KEY_READA) {
- if (pos == limit)
- break;
- reada_blocks[reada_count++] =
- B_N_CHILD_NUM(bh, pos);
- if (search_path->reada & PATH_READA_BACK)
- pos--;
- else
- pos++;
-
- /*
- * check to make sure we're in the same object
- */
- le_key = internal_key(bh, pos);
- if (le32_to_cpu(le_key->k_objectid) !=
- key->on_disk_key.k_objectid) {
- break;
- }
- }
- }
- }
-}
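-
-/*
- * Editor's note: a hedged sketch of how callers in this file drive
- * search_by_key; example_lookup is a hypothetical name, but the
- * pattern (INITIALIZE_PATH, search, use tp_item_head, pathrelse) is
- * the one used by the delete and insert paths below.
- */
-static int example_lookup(struct super_block *sb, const struct cpu_key *key)
-{
-    INITIALIZE_PATH(path); /* path_length starts at ILLEGAL_PATH_ELEMENT_OFFSET */
-    int retval;
-
-    retval = search_by_key(sb, key, &path, DISK_LEAF_NODE_LEVEL);
-    if (retval == ITEM_FOUND) {
-        /* tp_item_head(&path) now points at the matching item head */
-    }
-    pathrelse(&path);    /* drop the buffer references in every case */
-    return retval;
-}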
-
-/*
- * Form the path to an item, and the position in that item, which
- * contains the file byte defined by the key. If there is no item
- * corresponding to the key, we point the path to the item with the
- * maximal key less than the key, and *pos_in_item is set to one
- * past the last entry/byte in the item. If searching for an entry in
- * a directory item, and it is not found, *pos_in_item is set to one
- * entry more than the entry with the maximal key which is less than
- * the sought key.
- *
- * Note that if there is no entry in this same node which is one more,
- * then we point to an imaginary entry. For direct items the
- * position is in units of bytes, for indirect items the position is
- * in units of blocknr entries, and for directory items the position
- * is in units of directory entries.
- */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *sb,
- /* Key to search (cpu variable) */
- const struct cpu_key *p_cpu_key,
- /* Filled up by this function. */
- struct treepath *search_path)
-{
- struct item_head *p_le_ih; /* pointer to on-disk structure */
- int blk_size;
- loff_t item_offset, offset;
- struct reiserfs_dir_entry de;
- int retval;
-
- /* If searching for directory entry. */
- if (is_direntry_cpu_key(p_cpu_key))
- return search_by_entry_key(sb, p_cpu_key, search_path,
- &de);
-
- /* If not searching for directory entry. */
-
- /* If item is found. */
- retval = search_item(sb, p_cpu_key, search_path);
- if (retval == IO_ERROR)
- return retval;
- if (retval == ITEM_FOUND) {
-
- RFALSE(!ih_item_len
- (item_head
- (PATH_PLAST_BUFFER(search_path),
- PATH_LAST_POSITION(search_path))),
- "PAP-5165: item length equals zero");
-
- pos_in_item(search_path) = 0;
- return POSITION_FOUND;
- }
-
- RFALSE(!PATH_LAST_POSITION(search_path),
- "PAP-5170: position equals zero");
-
- /* Item is not found. Set path to the previous item. */
- p_le_ih =
- item_head(PATH_PLAST_BUFFER(search_path),
- --PATH_LAST_POSITION(search_path));
- blk_size = sb->s_blocksize;
-
- if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
- return FILE_NOT_FOUND;
-
- /* FIXME: quite ugly this far */
-
- item_offset = le_ih_k_offset(p_le_ih);
- offset = cpu_key_k_offset(p_cpu_key);
-
- /* Needed byte is contained in the item pointed to by the path. */
- if (item_offset <= offset &&
- item_offset + op_bytes_number(p_le_ih, blk_size) > offset) {
- pos_in_item(search_path) = offset - item_offset;
- if (is_indirect_le_ih(p_le_ih)) {
- pos_in_item(search_path) /= blk_size;
- }
- return POSITION_FOUND;
- }
-
- /*
- * Needed byte is not contained in the item pointed to by the
- * path. Set pos_in_item past the end of the item.
- */
- if (is_indirect_le_ih(p_le_ih))
- pos_in_item(search_path) =
- ih_item_len(p_le_ih) / UNFM_P_SIZE;
- else
- pos_in_item(search_path) = ih_item_len(p_le_ih);
-
- return POSITION_NOT_FOUND;
-}
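-
-/*
- * Editor's note: a hedged model of the pos_in_item units described
- * above. For direct items the position is a byte offset within the
- * item; for indirect items it indexes the array of block-number
- * entries, hence the division by the block size. Hypothetical names.
- */
-static int model_pos_in_item(long long key_offset, long long item_offset,
-                             int blocksize, int is_indirect)
-{
-    long long pos = key_offset - item_offset;    /* bytes into the item */
-
-    return is_indirect ? (int)(pos / blocksize) : (int)pos;
-}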
-
-/* Compare given item and item pointed to by the path. */
-int comp_items(const struct item_head *stored_ih, const struct treepath *path)
-{
- struct buffer_head *bh = PATH_PLAST_BUFFER(path);
- struct item_head *ih;
-
- /* Last buffer at the path is not in the tree. */
- if (!B_IS_IN_TREE(bh))
- return 1;
-
- /* Last path position is invalid. */
- if (PATH_LAST_POSITION(path) >= B_NR_ITEMS(bh))
- return 1;
-
- /* we need only to know, whether it is the same item */
- ih = tp_item_head(path);
- return memcmp(stored_ih, ih, IH_SIZE);
-}
-
-/* prepare for delete or cut of direct item */
-static inline int prepare_for_direct_item(struct treepath *path,
- struct item_head *le_ih,
- struct inode *inode,
- loff_t new_file_length, int *cut_size)
-{
- loff_t round_len;
-
- if (new_file_length == max_reiserfs_offset(inode)) {
- /* item has to be deleted */
- *cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE;
- }
- /* new file gets truncated */
- if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
- round_len = ROUND_UP(new_file_length);
- /* this was new_file_length < le_ih ... */
- if (round_len < le_ih_k_offset(le_ih)) {
- *cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE; /* Delete this item. */
- }
- /* Calculate first position and size for cutting from item. */
- pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
- *cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
-
- return M_CUT; /* Cut from this item. */
- }
-
- /* old file: items may have any length */
-
- if (new_file_length < le_ih_k_offset(le_ih)) {
- *cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE; /* Delete this item. */
- }
-
- /* Calculate first position and size for cutting from item. */
- *cut_size = -(ih_item_len(le_ih) -
- (pos_in_item(path) =
- new_file_length + 1 - le_ih_k_offset(le_ih)));
- return M_CUT; /* Cut from this item. */
-}
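-
-/*
- * Editor's note: a hedged model of the 3.6 cut arithmetic above,
- * assuming ROUND_UP aligns the new length and that key offsets are
- * 1-based, so the first byte past the kept data sits at
- * rounded_len - (item_offset - 1) within the item. Names are
- * illustrative; the returned cut size is negative, meaning removal.
- */
-static int model_direct_cut_size(long long rounded_len, long long item_offset,
-                                 int item_len, int *pos_in_item)
-{
-    *pos_in_item = (int)(rounded_len - (item_offset - 1));
-    return -(item_len - *pos_in_item);
-}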
-
-static inline int prepare_for_direntry_item(struct treepath *path,
- struct item_head *le_ih,
- struct inode *inode,
- loff_t new_file_length,
- int *cut_size)
-{
- if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
- new_file_length == max_reiserfs_offset(inode)) {
- RFALSE(ih_entry_count(le_ih) != 2,
- "PAP-5220: incorrect empty directory item (%h)", le_ih);
- *cut_size = -(IH_SIZE + ih_item_len(le_ih));
- /* Delete the directory item containing "." and ".." entry. */
- return M_DELETE;
- }
-
- if (ih_entry_count(le_ih) == 1) {
- /*
- * Delete the directory item, since there is only one record
- * left in this item
- */
- *cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE;
- }
-
- /* Cut one record from the directory item. */
- *cut_size =
- -(DEH_SIZE +
- entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
- return M_CUT;
-}
-
-#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
-
-/*
- * If the path points to a directory or direct item, calculate the
- * balance mode and the size to cut.
- * If the path points to an indirect item, remove some number of its
- * unformatted nodes.
- * In case of file truncate calculate whether this item must be
- * deleted/truncated or last unformatted node of this item will be
- * converted to a direct item.
- * This function returns a determination of what balance mode the
- * calling function should employ.
- */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct treepath *path,
- const struct cpu_key *item_key,
- /*
- * Number of unformatted nodes
- * which were removed from end
- * of the file.
- */
- int *removed,
- int *cut_size,
- /* MAX_KEY_OFFSET in case of delete. */
- unsigned long long new_file_length
- )
-{
- struct super_block *sb = inode->i_sb;
- struct item_head *p_le_ih = tp_item_head(path);
- struct buffer_head *bh = PATH_PLAST_BUFFER(path);
-
- BUG_ON(!th->t_trans_id);
-
- /* Stat_data item. */
- if (is_statdata_le_ih(p_le_ih)) {
-
- RFALSE(new_file_length != max_reiserfs_offset(inode),
- "PAP-5210: mode must be M_DELETE");
-
- *cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
- return M_DELETE;
- }
-
- /* Directory item. */
- if (is_direntry_le_ih(p_le_ih))
- return prepare_for_direntry_item(path, p_le_ih, inode,
- new_file_length,
- cut_size);
-
- /* Direct item. */
- if (is_direct_le_ih(p_le_ih))
- return prepare_for_direct_item(path, p_le_ih, inode,
- new_file_length, cut_size);
-
- /* Case of an indirect item. */
- {
- int blk_size = sb->s_blocksize;
- struct item_head s_ih;
- int need_re_search;
- int delete = 0;
- int result = M_CUT;
- int pos = 0;
-
- if ( new_file_length == max_reiserfs_offset (inode) ) {
- /*
- * prepare_for_delete_or_cut() is called by
- * reiserfs_delete_item()
- */
- new_file_length = 0;
- delete = 1;
- }
-
- do {
- need_re_search = 0;
- *cut_size = 0;
- bh = PATH_PLAST_BUFFER(path);
- copy_item_head(&s_ih, tp_item_head(path));
- pos = I_UNFM_NUM(&s_ih);
-
- while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
- __le32 *unfm;
- __u32 block;
-
- /*
- * Each unformatted block deletion may bring
- * one additional bitmap block into the transaction,
- * so the initial journal space reservation
- * might not be enough.
- */
- if (!delete && (*cut_size) != 0 &&
- reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
- break;
-
- unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
- block = get_block_num(unfm, 0);
-
- if (block != 0) {
- reiserfs_prepare_for_journal(sb, bh, 1);
- put_block_num(unfm, 0, 0);
- journal_mark_dirty(th, bh);
- reiserfs_free_block(th, inode, block, 1);
- }
-
- reiserfs_cond_resched(sb);
-
- if (item_moved (&s_ih, path)) {
- need_re_search = 1;
- break;
- }
-
- pos --;
- (*removed)++;
- (*cut_size) -= UNFM_P_SIZE;
-
- if (pos == 0) {
- (*cut_size) -= IH_SIZE;
- result = M_DELETE;
- break;
- }
- }
- /*
- * a trick. If the buffer has been logged, this will
- * do nothing. If we've broken the loop without logging
- * it, it will restore the buffer
- */
- reiserfs_restore_prepared_buffer(sb, bh);
- } while (need_re_search &&
- search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
- pos_in_item(path) = pos * UNFM_P_SIZE;
-
- if (*cut_size == 0) {
- /*
- * Nothing was cut. Maybe convert the last unformatted node
- * to a direct item?
- */
- result = M_CONVERT;
- }
- return result;
- }
-}
-
-/* Calculate number of bytes which will be deleted or cut during balance */
-static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
-{
- int del_size;
- struct item_head *p_le_ih = tp_item_head(tb->tb_path);
-
- if (is_statdata_le_ih(p_le_ih))
- return 0;
-
- del_size =
- (mode ==
- M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
- if (is_direntry_le_ih(p_le_ih)) {
- /*
- * return EMPTY_DIR_SIZE; we delete empty directories only.
- * We can't use EMPTY_DIR_SIZE, as old format dirs have a
- * different empty size. Ick. FIXME, is this right?
- */
- return del_size;
- }
-
- if (is_indirect_le_ih(p_le_ih))
- del_size = (del_size / UNFM_P_SIZE) *
- (PATH_PLAST_BUFFER(tb->tb_path)->b_size);
- return del_size;
-}
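-
-/*
- * Editor's note: for indirect items the insert_size above counts
- * unformatted-node pointers (UNFM_P_SIZE bytes each), while callers
- * want data bytes, hence the scaling. A hedged one-liner with
- * hypothetical names:
- */
-static long long model_indirect_deleted_bytes(int del_size, int unfm_p_size,
-                                              int block_size)
-{
-    return (long long)(del_size / unfm_p_size) * block_size;
-}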
-
-static void init_tb_struct(struct reiserfs_transaction_handle *th,
- struct tree_balance *tb,
- struct super_block *sb,
- struct treepath *path, int size)
-{
-
- BUG_ON(!th->t_trans_id);
-
- memset(tb, '\0', sizeof(struct tree_balance));
- tb->transaction_handle = th;
- tb->tb_sb = sb;
- tb->tb_path = path;
- PATH_OFFSET_PBUFFER(path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
- PATH_OFFSET_POSITION(path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
- tb->insert_size[0] = size;
-}
-
-void padd_item(char *item, int total_length, int length)
-{
- int i;
-
- for (i = total_length; i > length;)
- item[--i] = 0;
-}
-
-#ifdef REISERQUOTA_DEBUG
-char key2type(struct reiserfs_key *ih)
-{
- if (is_direntry_le_key(2, ih))
- return 'd';
- if (is_direct_le_key(2, ih))
- return 'D';
- if (is_indirect_le_key(2, ih))
- return 'i';
- if (is_statdata_le_key(2, ih))
- return 's';
- return 'u';
-}
-
-char head2type(struct item_head *ih)
-{
- if (is_direntry_le_ih(ih))
- return 'd';
- if (is_direct_le_ih(ih))
- return 'D';
- if (is_indirect_le_ih(ih))
- return 'i';
- if (is_statdata_le_ih(ih))
- return 's';
- return 'u';
-}
-#endif
-
-/*
- * Delete object item.
- * th - active transaction handle
- * path - path to the deleted item
- * item_key - key to search for the deleted item
- * inode - used for updating i_blocks and quotas
- * un_bh - NULL or unformatted node pointer
- */
-int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
- struct treepath *path, const struct cpu_key *item_key,
- struct inode *inode, struct buffer_head *un_bh)
-{
- struct super_block *sb = inode->i_sb;
- struct tree_balance s_del_balance;
- struct item_head s_ih;
- struct item_head *q_ih;
- int quota_cut_bytes;
- int ret_value, del_size, removed;
- int depth;
-
-#ifdef CONFIG_REISERFS_CHECK
- char mode;
-#endif
-
- BUG_ON(!th->t_trans_id);
-
- init_tb_struct(th, &s_del_balance, sb, path,
- 0 /*size is unknown */ );
-
- while (1) {
- removed = 0;
-
-#ifdef CONFIG_REISERFS_CHECK
- mode =
-#endif
- prepare_for_delete_or_cut(th, inode, path,
- item_key, &removed,
- &del_size,
- max_reiserfs_offset(inode));
-
- RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
-
- copy_item_head(&s_ih, tp_item_head(path));
- s_del_balance.insert_size[0] = del_size;
-
- ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
- if (ret_value != REPEAT_SEARCH)
- break;
-
- PROC_INFO_INC(sb, delete_item_restarted);
-
- /* file system changed, repeat search */
- ret_value =
- search_for_position_by_key(sb, item_key, path);
- if (ret_value == IO_ERROR)
- break;
- if (ret_value == FILE_NOT_FOUND) {
- reiserfs_warning(sb, "vs-5340",
- "no items of the file %K found",
- item_key);
- break;
- }
- } /* while (1) */
-
- if (ret_value != CARRY_ON) {
- unfix_nodes(&s_del_balance);
- return 0;
- }
-
- /* reiserfs_delete_item returns the item length on success */
- ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
- q_ih = tp_item_head(path);
- quota_cut_bytes = ih_item_len(q_ih);
-
- /*
- * hack so the quota code doesn't have to guess if the file has a
- * tail. On tail insert, we allocate quota for 1 unformatted node.
- * We test the offset because the tail might have been
- * split into multiple items, and we only want to decrement for
- * the unfm node once
- */
- if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
- if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
- quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
- } else {
- quota_cut_bytes = 0;
- }
- }
-
- if (un_bh) {
- int off;
- char *data;
-
- /*
- * We are in direct2indirect conversion, so move tail contents
- * to the unformatted node
- */
- /*
- * note, we do the copy before preparing the buffer because we
- * don't care about the contents of the unformatted node yet.
- * The only thing we really care about is that the direct
- * item's data ends up in the unformatted node.
- *
- * Otherwise, we would have to call
- * reiserfs_prepare_for_journal on the unformatted node,
- * which might schedule, meaning we'd have to loop all the
- * way back up to the start of the while loop.
- *
- * The unformatted node must be dirtied later on. We can't be
- * sure here if the entire tail has been deleted yet.
- *
- * un_bh is from the page cache (all unformatted nodes are
- * from the page cache) and might be a highmem page. So, we
- * can't use un_bh->b_data.
- * -clm
- */
-
- data = kmap_atomic(un_bh->b_page);
- off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
- memcpy(data + off,
- ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
- ret_value);
- kunmap_atomic(data);
- }
-
- /* Perform balancing after all resources have been collected at once. */
- do_balance(&s_del_balance, NULL, NULL, M_DELETE);
-
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(sb, REISERFS_DEBUG_CODE,
- "reiserquota delete_item(): freeing %u, id=%u type=%c",
- quota_cut_bytes, inode->i_uid, head2type(&s_ih));
-#endif
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_free_space_nodirty(inode, quota_cut_bytes);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-
- /* Return deleted body length */
- return ret_value;
-}
-
-/*
- * Summary Of Mechanisms For Handling Collisions Between Processes:
- *
- * deletion of the body of the object is performed by iput(), with the
- * result that if multiple processes are operating on a file, the
- * deletion of the body of the file is deferred until the last process
- * that has an open inode performs its iput().
- *
- * writes and truncates are protected from collisions by use of
- * semaphores.
- *
- * creates, linking, and mknod are protected from collisions with other
- * processes by making the reiserfs_add_entry() the last step in the
- * creation, and then rolling back all changes if there was a collision.
- * - Hans
-*/
-
-/* this deletes an item which never gets split */
-void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct reiserfs_key *key)
-{
- struct super_block *sb = th->t_super;
- struct tree_balance tb;
- INITIALIZE_PATH(path);
- int item_len = 0;
- int tb_init = 0;
- struct cpu_key cpu_key = {};
- int retval;
- int quota_cut_bytes = 0;
-
- BUG_ON(!th->t_trans_id);
-
- le_key2cpu_key(&cpu_key, key);
-
- while (1) {
- retval = search_item(th->t_super, &cpu_key, &path);
- if (retval == IO_ERROR) {
- reiserfs_error(th->t_super, "vs-5350",
- "i/o failure occurred trying "
- "to delete %K", &cpu_key);
- break;
- }
- if (retval != ITEM_FOUND) {
- pathrelse(&path);
- /*
- * No need for a warning if there is just no free
- * space to insert '..' item into the
- * newly-created subdir
- */
- if (!
- ((unsigned long long)
- GET_HASH_VALUE(le_key_k_offset
- (le_key_version(key), key)) == 0
- && (unsigned long long)
- GET_GENERATION_NUMBER(le_key_k_offset
- (le_key_version(key),
- key)) == 1))
- reiserfs_warning(th->t_super, "vs-5355",
- "%k not found", key);
- break;
- }
- if (!tb_init) {
- tb_init = 1;
- item_len = ih_item_len(tp_item_head(&path));
- init_tb_struct(th, &tb, th->t_super, &path,
- -(IH_SIZE + item_len));
- }
- quota_cut_bytes = ih_item_len(tp_item_head(&path));
-
- retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
- if (retval == REPEAT_SEARCH) {
- PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
- continue;
- }
-
- if (retval == CARRY_ON) {
- do_balance(&tb, NULL, NULL, M_DELETE);
- /*
- * Should we count quota for item? (we don't
- * count quotas for save-links)
- */
- if (inode) {
- int depth;
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
- "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
- quota_cut_bytes, inode->i_uid,
- key2type(key));
-#endif
- depth = reiserfs_write_unlock_nested(sb);
- dquot_free_space_nodirty(inode,
- quota_cut_bytes);
- reiserfs_write_lock_nested(sb, depth);
- }
- break;
- }
-
- /* IO_ERROR, NO_DISK_SPACE, etc */
- reiserfs_warning(th->t_super, "vs-5360",
- "could not delete %K due to fix_nodes failure",
- &cpu_key);
- unfix_nodes(&tb);
- break;
- }
-
- reiserfs_check_path(&path);
-}
-
-int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
- struct inode *inode)
-{
- int err;
- inode->i_size = 0;
- BUG_ON(!th->t_trans_id);
-
- /* for a directory this deletes the item containing "." and ".." */
- err =
- reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
- if (err)
- return err;
-
-#if defined( USE_INODE_GENERATION_COUNTER )
- if (!old_format_only(th->t_super)) {
- __le32 *inode_generation;
-
- inode_generation =
- &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
- le32_add_cpu(inode_generation, 1);
- }
-/* USE_INODE_GENERATION_COUNTER */
-#endif
- reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
-
- return err;
-}
-
-static void unmap_buffers(struct page *page, loff_t pos)
-{
- struct buffer_head *bh;
- struct buffer_head *head;
- struct buffer_head *next;
- unsigned long tail_index;
- unsigned long cur_index;
-
- if (page) {
- if (page_has_buffers(page)) {
- tail_index = pos & (PAGE_SIZE - 1);
- cur_index = 0;
- head = page_buffers(page);
- bh = head;
- do {
- next = bh->b_this_page;
-
- /*
- * we want to unmap the buffers that contain
- * the tail, and all the buffers after it
- * (since the tail must be at the end of the
- * file). We don't want to unmap file data
- * before the tail, since it might be dirty
- * and waiting to reach disk
- */
- cur_index += bh->b_size;
- if (cur_index > tail_index) {
- reiserfs_unmap_buffer(bh);
- }
- bh = next;
- } while (bh != head);
- }
- }
-}
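-
-/*
- * Editor's note: a hedged userspace model of the loop above. With
- * 1k buffers in a 4k page and a tail starting at byte 2500, buffers
- * 2 and 3 would be unmapped (cur_index reaches 3072 and 4096, both
- * past the tail start) while buffers 0 and 1, which may hold dirty
- * file data, stay mapped. Names are illustrative.
- */
-static unsigned model_unmap_tail(int page_size, int bh_size, int tail_pos)
-{
-    unsigned unmapped = 0;
-    int cur_index = 0;
-    int i;
-
-    for (i = 0; i < page_size / bh_size; i++) {
-        cur_index += bh_size;
-        if (cur_index > tail_pos)
-            unmapped |= 1u << i;    /* this buffer would be unmapped */
-    }
-    return unmapped;    /* 4k page, 1k buffers, tail at 2500: 0b1100 */
-}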
-
-static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct page *page,
- struct treepath *path,
- const struct cpu_key *item_key,
- loff_t new_file_size, char *mode)
-{
- struct super_block *sb = inode->i_sb;
- int block_size = sb->s_blocksize;
- int cut_bytes;
- BUG_ON(!th->t_trans_id);
- BUG_ON(new_file_size != inode->i_size);
-
- /*
- * the page being sent in could be NULL if there was an i/o error
- * reading in the last block. The user will hit problems trying to
- * read the file, but for now we just skip the indirect2direct conversion
- */
- if (atomic_read(&inode->i_count) > 1 ||
- !tail_has_to_be_packed(inode) ||
- !page || (REISERFS_I(inode)->i_flags & i_nopack_mask)) {
- /* leave tail in an unformatted node */
- *mode = M_SKIP_BALANCING;
- cut_bytes =
- block_size - (new_file_size & (block_size - 1));
- pathrelse(path);
- return cut_bytes;
- }
-
- /* Perform the conversion to a direct_item. */
- return indirect2direct(th, inode, page, path, item_key,
- new_file_size, mode);
-}
-
-/*
- * We did an indirect_to_direct conversion and inserted the direct
- * item successfully, but there was no disk space to cut the unfm
- * pointer being converted. Therefore we have to delete the inserted
- * direct item(s).
- */
-static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct treepath *path)
-{
- struct cpu_key tail_key;
- int tail_len;
- int removed;
- BUG_ON(!th->t_trans_id);
-
- make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
- tail_key.key_length = 4;
-
- tail_len =
- (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
- while (tail_len) {
- /* look for the last byte of the tail */
- if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
- POSITION_NOT_FOUND)
- reiserfs_panic(inode->i_sb, "vs-5615",
- "found invalid item");
- RFALSE(path->pos_in_item !=
- ih_item_len(tp_item_head(path)) - 1,
- "vs-5616: appended bytes found");
- PATH_LAST_POSITION(path)--;
-
- removed =
- reiserfs_delete_item(th, path, &tail_key, inode,
- NULL /*unbh not needed */ );
- RFALSE(removed <= 0
- || removed > tail_len,
- "vs-5617: there was tail %d bytes, removed item length %d bytes",
- tail_len, removed);
- tail_len -= removed;
- set_cpu_key_k_offset(&tail_key,
- cpu_key_k_offset(&tail_key) - removed);
- }
- reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
- "conversion has been rolled back due to "
- "lack of disk space");
- mark_inode_dirty(inode);
-}
-
-/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
-int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
- struct treepath *path,
- struct cpu_key *item_key,
- struct inode *inode,
- struct page *page, loff_t new_file_size)
-{
- struct super_block *sb = inode->i_sb;
- /*
- * Every function which is going to call do_balance must first
- * create a tree_balance structure. Then it must fill up this
- * structure by using the init_tb_struct and fix_nodes functions.
- * After that we can perform the tree balancing.
- */
- struct tree_balance s_cut_balance;
- struct item_head *p_le_ih;
- int cut_size = 0; /* Amount to be cut. */
- int ret_value = CARRY_ON;
- int removed = 0; /* Number of the removed unformatted nodes. */
- int is_inode_locked = 0;
- char mode; /* Mode of the balance. */
- int retval2 = -1;
- int quota_cut_bytes;
- loff_t tail_pos = 0;
- int depth;
-
- BUG_ON(!th->t_trans_id);
-
- init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
- cut_size);
-
- /*
- * Repeat this loop until we either cut the item without needing
- * to balance, or fix_nodes completes without a schedule occurring
- */
- while (1) {
- /*
- * Determine the balance mode, position of the first byte to
- * be cut, and the size to be cut. In the case of an indirect
- * item, free the unformatted nodes which are pointed to by
- * the cut pointers.
- */
-
- mode =
- prepare_for_delete_or_cut(th, inode, path,
- item_key, &removed,
- &cut_size, new_file_size);
- if (mode == M_CONVERT) {
- /*
- * convert last unformatted node to direct item or
- * leave tail in the unformatted node
- */
- RFALSE(ret_value != CARRY_ON,
- "PAP-5570: can not convert twice");
-
- ret_value =
- maybe_indirect_to_direct(th, inode, page,
- path, item_key,
- new_file_size, &mode);
- if (mode == M_SKIP_BALANCING)
- /* tail has been left in the unformatted node */
- return ret_value;
-
- is_inode_locked = 1;
-
- /*
- * removing the last unformatted node will
- * change the value we have to return to truncate.
- * Save it
- */
- retval2 = ret_value;
-
- /*
- * So, we have performed the first part of the
- * conversion:
- * inserting the new direct item. Now we are
- * removing the last unformatted node pointer.
- * Set key to search for it.
- */
- set_cpu_key_k_type(item_key, TYPE_INDIRECT);
- item_key->key_length = 4;
- new_file_size -=
- (new_file_size & (sb->s_blocksize - 1));
- tail_pos = new_file_size;
- set_cpu_key_k_offset(item_key, new_file_size + 1);
- if (search_for_position_by_key
- (sb, item_key,
- path) == POSITION_NOT_FOUND) {
- print_block(PATH_PLAST_BUFFER(path), 3,
- PATH_LAST_POSITION(path) - 1,
- PATH_LAST_POSITION(path) + 1);
- reiserfs_panic(sb, "PAP-5580", "item to "
- "convert does not exist (%K)",
- item_key);
- }
- continue;
- }
- if (cut_size == 0) {
- pathrelse(path);
- return 0;
- }
-
- s_cut_balance.insert_size[0] = cut_size;
-
- ret_value = fix_nodes(mode, &s_cut_balance, NULL, NULL);
- if (ret_value != REPEAT_SEARCH)
- break;
-
- PROC_INFO_INC(sb, cut_from_item_restarted);
-
- ret_value =
- search_for_position_by_key(sb, item_key, path);
- if (ret_value == POSITION_FOUND)
- continue;
-
- reiserfs_warning(sb, "PAP-5610", "item %K not found",
- item_key);
- unfix_nodes(&s_cut_balance);
- return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
- } /* while */
-
- /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
- if (ret_value != CARRY_ON) {
- if (is_inode_locked) {
- /*
- * FIXME: this does not seem to be needed: we are
- * always able to cut the item
- */
- indirect_to_direct_roll_back(th, inode, path);
- }
- if (ret_value == NO_DISK_SPACE)
- reiserfs_warning(sb, "reiserfs-5092",
- "NO_DISK_SPACE");
- unfix_nodes(&s_cut_balance);
- return -EIO;
- }
-
- /* go ahead and perform balancing */
-
- RFALSE(mode == M_PASTE || mode == M_INSERT, "invalid mode");
-
- /* Calculate number of bytes that need to be cut from the item. */
- quota_cut_bytes =
- (mode ==
- M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
- insert_size[0];
- if (retval2 == -1)
- ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
- else
- ret_value = retval2;
-
- /*
- * For direct items, we only change the quota when deleting the last
- * item.
- */
- p_le_ih = tp_item_head(s_cut_balance.tb_path);
- if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
- if (mode == M_DELETE &&
- (le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
- 1) {
- /* FIXME: this is to keep 3.5 happy */
- REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
- quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
- } else {
- quota_cut_bytes = 0;
- }
- }
-#ifdef CONFIG_REISERFS_CHECK
- if (is_inode_locked) {
- struct item_head *le_ih =
- tp_item_head(s_cut_balance.tb_path);
- /*
- * we are going to complete the indirect2direct conversion. Make
- * sure that we remove exactly the last unformatted node pointer
- * of the item
- */
- if (!is_indirect_le_ih(le_ih))
- reiserfs_panic(sb, "vs-5652",
- "item must be indirect %h", le_ih);
-
- if (mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
- reiserfs_panic(sb, "vs-5653", "completing "
- "indirect2direct conversion indirect "
- "item %h being deleted must be of "
- "4 byte long", le_ih);
-
- if (mode == M_CUT
- && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
- reiserfs_panic(sb, "vs-5654", "can not complete "
- "indirect2direct conversion of %h "
- "(CUT, insert_size==%d)",
- le_ih, s_cut_balance.insert_size[0]);
- }
- /*
- * it would be useful to make sure that the right neighboring
- * item is a direct item of this file
- */
- }
-#endif
-
- do_balance(&s_cut_balance, NULL, NULL, mode);
- if (is_inode_locked) {
- /*
- * we've done an indirect->direct conversion. When the
- * data block was freed, it was removed from the list of
- * blocks that must be flushed before the transaction
- * commits; make sure to unmap and invalidate it
- */
- unmap_buffers(page, tail_pos);
- REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
- }
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
- "reiserquota cut_from_item(): freeing %u id=%u type=%c",
- quota_cut_bytes, inode->i_uid, '?');
-#endif
- depth = reiserfs_write_unlock_nested(sb);
- dquot_free_space_nodirty(inode, quota_cut_bytes);
- reiserfs_write_lock_nested(sb, depth);
- return ret_value;
-}
-
-static void truncate_directory(struct reiserfs_transaction_handle *th,
- struct inode *inode)
-{
- BUG_ON(!th->t_trans_id);
- if (inode->i_nlink)
- reiserfs_error(inode->i_sb, "vs-5655", "link count != 0");
-
- set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
- set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
- reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
- reiserfs_update_sd(th, inode);
- set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
- set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
-}
-
-/*
- * Truncate file to the new size. Note, this must be called with a
- * transaction already started
- */
-int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
- struct inode *inode, /* ->i_size contains new size */
- struct page *page, /* up to date for last block */
- /*
- * when it is called by file_release to convert
- * the tail - no timestamps should be updated
- */
- int update_timestamps
- )
-{
- INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
- struct item_head *p_le_ih; /* Pointer to an item header. */
-
- /* Key to search for a previous file item. */
- struct cpu_key s_item_key;
- loff_t file_size, /* Old file size. */
- new_file_size; /* New file size. */
- int deleted; /* Number of deleted or truncated bytes. */
- int retval;
- int err = 0;
-
- BUG_ON(!th->t_trans_id);
- if (!
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
- || S_ISLNK(inode->i_mode)))
- return 0;
-
- /* deletion of directory - no need to update timestamps */
- if (S_ISDIR(inode->i_mode)) {
- truncate_directory(th, inode);
- return 0;
- }
-
- /* Get new file size. */
- new_file_size = inode->i_size;
-
- /* FIXME: note that the key type is unimportant here */
- make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
- TYPE_DIRECT, 3);
-
- retval =
- search_for_position_by_key(inode->i_sb, &s_item_key,
- &s_search_path);
- if (retval == IO_ERROR) {
- reiserfs_error(inode->i_sb, "vs-5657",
- "i/o failure occurred trying to truncate %K",
- &s_item_key);
- err = -EIO;
- goto out;
- }
- if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
- reiserfs_error(inode->i_sb, "PAP-5660",
- "wrong result %d of search for %K", retval,
- &s_item_key);
-
- err = -EIO;
- goto out;
- }
-
- s_search_path.pos_in_item--;
-
- /* Get real file size (total length of all file items) */
- p_le_ih = tp_item_head(&s_search_path);
- if (is_statdata_le_ih(p_le_ih))
- file_size = 0;
- else {
- loff_t offset = le_ih_k_offset(p_le_ih);
- int bytes =
- op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
-
- /*
- * this may differ from the real file size: it is exact only
- * if the last direct item had no padding zeros and the last
- * unformatted node had no free space
- */
- file_size = offset + bytes - 1;
- }
- /*
- * are we doing a full truncate or delete? If so,
- * kick in the reada code
- */
- if (new_file_size == 0)
- s_search_path.reada = PATH_READA | PATH_READA_BACK;
-
- if (file_size == 0 || file_size < new_file_size) {
- goto update_and_out;
- }
-
- /* Update key to search for the last file item. */
- set_cpu_key_k_offset(&s_item_key, file_size);
-
- do {
- /* Cut or delete file item. */
- deleted =
- reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
- inode, page, new_file_size);
- if (deleted < 0) {
- reiserfs_warning(inode->i_sb, "vs-5665",
- "reiserfs_cut_from_item failed");
- reiserfs_check_path(&s_search_path);
- return 0;
- }
-
- RFALSE(deleted > file_size,
- "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
- deleted, file_size, &s_item_key);
-
- /* Change key to search the last file item. */
- file_size -= deleted;
-
- set_cpu_key_k_offset(&s_item_key, file_size);
-
- /*
- * Loop while there are bytes to truncate and the previous
- * file item is present in the tree.
- */
-
- /*
- * This loop could take a really long time, and could log
- * many more blocks than a transaction can hold. So, we do
- * a polite journal end here, and if the transaction needs
- * ending, we make sure the file is consistent before ending
- * the current trans and starting a new one
- */
- if (journal_transaction_should_end(th, 0) ||
- reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
- pathrelse(&s_search_path);
-
- if (update_timestamps) {
- inode_set_mtime_to_ts(inode,
- current_time(inode));
- inode_set_ctime_current(inode);
- }
- reiserfs_update_sd(th, inode);
-
- err = journal_end(th);
- if (err)
- goto out;
- err = journal_begin(th, inode->i_sb,
- JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD + JOURNAL_PER_BALANCE_CNT * 4) ;
- if (err)
- goto out;
- reiserfs_update_inode_transaction(inode);
- }
- } while (file_size > ROUND_UP(new_file_size) &&
- search_for_position_by_key(inode->i_sb, &s_item_key,
- &s_search_path) == POSITION_FOUND);
-
- RFALSE(file_size > ROUND_UP(new_file_size),
- "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
- new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
-
-update_and_out:
- if (update_timestamps) {
- /* this is truncate, not file closing */
- inode_set_mtime_to_ts(inode, current_time(inode));
- inode_set_ctime_current(inode);
- }
- reiserfs_update_sd(th, inode);
-
-out:
- pathrelse(&s_search_path);
- return err;
-}
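-
-/*
- * Editor's note: the "polite journal end" inside the truncate loop
- * above follows a pattern used throughout this file: make the on-disk
- * state consistent, end the transaction, begin a new one, and reattach
- * the inode to it. A hedged sketch with this file's own helpers
- * (example_restart_transaction is a hypothetical name):
- */
-static int example_restart_transaction(struct reiserfs_transaction_handle *th,
-                                       struct inode *inode, int nblocks)
-{
-    int err;
-
-    reiserfs_update_sd(th, inode);    /* keep the stat data consistent */
-    err = journal_end(th);
-    if (err)
-        return err;
-    err = journal_begin(th, inode->i_sb, nblocks);
-    if (err)
-        return err;
-    reiserfs_update_inode_transaction(inode);
-    return 0;
-}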
-
-#ifdef CONFIG_REISERFS_CHECK
-/* this makes sure that we __append__, not overwrite or add holes */
-static void check_research_for_paste(struct treepath *path,
- const struct cpu_key *key)
-{
- struct item_head *found_ih = tp_item_head(path);
-
- if (is_direct_le_ih(found_ih)) {
- if (le_ih_k_offset(found_ih) +
- op_bytes_number(found_ih,
- get_last_bh(path)->b_size) !=
- cpu_key_k_offset(key)
- || op_bytes_number(found_ih,
- get_last_bh(path)->b_size) !=
- pos_in_item(path))
- reiserfs_panic(NULL, "PAP-5720", "found direct item "
- "%h or position (%d) does not match "
- "to key %K", found_ih,
- pos_in_item(path), key);
- }
- if (is_indirect_le_ih(found_ih)) {
- if (le_ih_k_offset(found_ih) +
- op_bytes_number(found_ih,
- get_last_bh(path)->b_size) !=
- cpu_key_k_offset(key)
- || I_UNFM_NUM(found_ih) != pos_in_item(path)
- || get_ih_free_space(found_ih) != 0)
- reiserfs_panic(NULL, "PAP-5730", "found indirect "
- "item (%h) or position (%d) does not "
- "match to key (%K)",
- found_ih, pos_in_item(path), key);
- }
-}
-#endif /* config reiserfs check */
-
-/*
- * Paste bytes into an existing item.
- * Returns the number of bytes pasted into the item.
- */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
- /* Path to the pasted item. */
- struct treepath *search_path,
- /* Key to search for the needed item. */
- const struct cpu_key *key,
- /* Inode item belongs to */
- struct inode *inode,
- /* Pointer to the bytes to paste. */
- const char *body,
- /* Size of pasted bytes. */
- int pasted_size)
-{
- struct super_block *sb = inode->i_sb;
- struct tree_balance s_paste_balance;
- int retval;
- int fs_gen;
- int depth;
-
- BUG_ON(!th->t_trans_id);
-
- fs_gen = get_generation(inode->i_sb);
-
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
- "reiserquota paste_into_item(): allocating %u id=%u type=%c",
- pasted_size, inode->i_uid,
- key2type(&key->on_disk_key));
-#endif
-
- depth = reiserfs_write_unlock_nested(sb);
- retval = dquot_alloc_space_nodirty(inode, pasted_size);
- reiserfs_write_lock_nested(sb, depth);
- if (retval) {
- pathrelse(search_path);
- return retval;
- }
- init_tb_struct(th, &s_paste_balance, th->t_super, search_path,
- pasted_size);
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- s_paste_balance.key = key->on_disk_key;
-#endif
-
- /* DQUOT_* can schedule, must check before the fix_nodes */
- if (fs_changed(fs_gen, inode->i_sb)) {
- goto search_again;
- }
-
- while ((retval =
- fix_nodes(M_PASTE, &s_paste_balance, NULL,
- body)) == REPEAT_SEARCH) {
-search_again:
- /* file system changed while we were in the fix_nodes */
- PROC_INFO_INC(th->t_super, paste_into_item_restarted);
- retval =
- search_for_position_by_key(th->t_super, key,
- search_path);
- if (retval == IO_ERROR) {
- retval = -EIO;
- goto error_out;
- }
- if (retval == POSITION_FOUND) {
- reiserfs_warning(inode->i_sb, "PAP-5710",
- "entry or pasted byte (%K) exists",
- key);
- retval = -EEXIST;
- goto error_out;
- }
-#ifdef CONFIG_REISERFS_CHECK
- check_research_for_paste(search_path, key);
-#endif
- }
-
- /*
- * Perform balancing after all resources have been collected by
- * fix_nodes, so that accessing them cannot trigger a schedule.
- */
- if (retval == CARRY_ON) {
- do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
- return 0;
- }
- retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
- /* this also releases the path */
- unfix_nodes(&s_paste_balance);
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
- "reiserquota paste_into_item(): freeing %u id=%u type=%c",
- pasted_size, inode->i_uid,
- key2type(&key->on_disk_key));
-#endif
- depth = reiserfs_write_unlock_nested(sb);
- dquot_free_space_nodirty(inode, pasted_size);
- reiserfs_write_lock_nested(sb, depth);
- return retval;
-}
-
-/*
- * Insert new item into the buffer at the path.
- * th - active transaction handle
- * path - path to the inserted item
- * ih - pointer to the item header to insert
- * body - pointer to the bytes to insert
- */
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
- struct treepath *path, const struct cpu_key *key,
- struct item_head *ih, struct inode *inode,
- const char *body)
-{
- struct tree_balance s_ins_balance;
- int retval;
- int fs_gen = 0;
- int quota_bytes = 0;
-
- BUG_ON(!th->t_trans_id);
-
- if (inode) { /* Do we count quotas for item? */
- int depth;
- fs_gen = get_generation(inode->i_sb);
- quota_bytes = ih_item_len(ih);
-
- /*
- * hack so the quota code doesn't have to guess
- * if the file has a tail, links are always tails,
- * so there's no guessing needed
- */
- if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
- quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
-#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
- "reiserquota insert_item(): allocating %u id=%u type=%c",
- quota_bytes, inode->i_uid, head2type(ih));
-#endif
- /*
- * We can't dirty inode here. It would be immediately
- * written but appropriate stat item isn't inserted yet...
- */
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- retval = dquot_alloc_space_nodirty(inode, quota_bytes);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- if (retval) {
- pathrelse(path);
- return retval;
- }
- }
- init_tb_struct(th, &s_ins_balance, th->t_super, path,
- IH_SIZE + ih_item_len(ih));
-#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- s_ins_balance.key = key->on_disk_key;
-#endif
- /*
- * DQUOT_* can schedule, must check to be sure calling
- * fix_nodes is safe
- */
- if (inode && fs_changed(fs_gen, inode->i_sb)) {
- goto search_again;
- }
-
- while ((retval =
- fix_nodes(M_INSERT, &s_ins_balance, ih,
- body)) == REPEAT_SEARCH) {
-search_again:
- /* file system changed while we were in the fix_nodes */
- PROC_INFO_INC(th->t_super, insert_item_restarted);
- retval = search_item(th->t_super, key, path);
- if (retval == IO_ERROR) {
- retval = -EIO;
- goto error_out;
- }
- if (retval == ITEM_FOUND) {
- reiserfs_warning(th->t_super, "PAP-5760",
- "key %K already exists in the tree",
- key);
- retval = -EEXIST;
- goto error_out;
- }
- }
-
- /* perform balancing once all resources have been collected */
- if (retval == CARRY_ON) {
- do_balance(&s_ins_balance, ih, body, M_INSERT);
- return 0;
- }
-
- retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
- /* also releases the path */
- unfix_nodes(&s_ins_balance);
-#ifdef REISERQUOTA_DEBUG
- if (inode)
- reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
- "reiserquota insert_item(): freeing %u id=%u type=%c",
- quota_bytes, inode->i_uid, head2type(ih));
-#endif
- if (inode) {
- int depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_free_space_nodirty(inode, quota_bytes);
- reiserfs_write_lock_nested(inode->i_sb, depth);
- }
- return retval;
-}
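-
-/*
- * Editor's note: reiserfs_insert_item, reiserfs_paste_into_item and
- * the delete paths all share one control shape around fix_nodes():
- * CARRY_ON means balance now, REPEAT_SEARCH means the tree moved under
- * us (re-search the key, then retry), anything else is an error. A
- * hedged, stripped-down sketch of that loop for the insert case:
- */
-static int example_fix_nodes_loop(struct tree_balance *tb,
-                                  struct item_head *ih, const char *body)
-{
-    int retval;
-
-    while ((retval = fix_nodes(M_INSERT, tb, ih, body)) == REPEAT_SEARCH) {
-        /* the tree changed: re-search the key here, refresh the
-         * path inside tb, and fall back into fix_nodes */
-    }
-
-    if (retval == CARRY_ON) {
-        do_balance(tb, ih, body, M_INSERT);
-        return 0;
-    }
-
-    unfix_nodes(tb);    /* also releases the path */
-    return (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-}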
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
deleted file mode 100644
index ab76468da02d..000000000000
--- a/fs/reiserfs/super.c
+++ /dev/null
@@ -1,2646 +0,0 @@
-/*
- * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
- *
- * Trivial changes by Alan Cox to add the LFS fixes
- *
- * Trivial Changes:
- * Rights granted to Hans Reiser to redistribute under other terms providing
- * he accepts all liability including but not limited to patent, fitness
- * for purpose, and direct or indirect claims arising from failure to perform.
- *
- * NO WARRANTY
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/time.h>
-#include <linux/uaccess.h>
-#include "reiserfs.h"
-#include "acl.h"
-#include "xattr.h"
-#include <linux/init.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/exportfs.h>
-#include <linux/quotaops.h>
-#include <linux/vfs.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/crc32.h>
-#include <linux/seq_file.h>
-
-struct file_system_type reiserfs_fs_type;
-
-static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
-static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
-static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
-
-int is_reiserfs_3_5(struct reiserfs_super_block *rs)
-{
- return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
- strlen(reiserfs_3_5_magic_string));
-}
-
-int is_reiserfs_3_6(struct reiserfs_super_block *rs)
-{
- return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
- strlen(reiserfs_3_6_magic_string));
-}
-
-int is_reiserfs_jr(struct reiserfs_super_block *rs)
-{
- return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
- strlen(reiserfs_jr_magic_string));
-}
-
-static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
-{
- return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
- is_reiserfs_jr(rs));
-}
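
All three magic checks above share one pattern: compare the first strlen(magic) bytes of the (not NUL-terminated) on-disk magic field against a known literal. A minimal standalone sketch of that probe (the literals and buffer layout here are illustrative stand-ins, not the real superblock definitions):

#include <stdio.h>
#include <string.h>

/* illustrative stand-ins for the three on-disk magic literals */
static const char *known_magics[] = { "ReIsErFs", "ReIsEr2Fs", "ReIsEr3Fs" };

/* return 1 if the magic field starts with any known literal; the field
 * is not NUL-terminated on disk, hence strncmp over the literal's length */
static int matches_any_magic(const char *field)
{
	size_t i;

	for (i = 0; i < sizeof(known_magics) / sizeof(known_magics[0]); i++)
		if (!strncmp(field, known_magics[i], strlen(known_magics[i])))
			return 1;
	return 0;
}

int main(void)
{
	char field[16] = "ReIsEr2Fs";	/* remaining bytes are zeroed */

	printf("%d\n", matches_any_magic(field));	/* prints 1 */
	return 0;
}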
-
-static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
-
-static int reiserfs_sync_fs(struct super_block *s, int wait)
-{
- struct reiserfs_transaction_handle th;
-
- /*
- * Writeback quota in non-journalled quota case - journalled quota has
- * no dirty dquots
- */
- dquot_writeback_dquots(s, -1);
- reiserfs_write_lock(s);
- if (!journal_begin(&th, s, 1))
- if (!journal_end_sync(&th))
- reiserfs_flush_old_commits(s);
- reiserfs_write_unlock(s);
- return 0;
-}
-
-static void flush_old_commits(struct work_struct *work)
-{
- struct reiserfs_sb_info *sbi;
- struct super_block *s;
-
- sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
- s = sbi->s_journal->j_work_sb;
-
- /*
- * We need s_umount for protecting quota writeback. We have to use
- * trylock as reiserfs_cancel_old_flush() may be waiting for this work
- * to complete with s_umount held.
- */
- if (!down_read_trylock(&s->s_umount)) {
- /* Requeue work if we are not cancelling it */
- spin_lock(&sbi->old_work_lock);
- if (sbi->work_queued == 1)
- queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
- spin_unlock(&sbi->old_work_lock);
- return;
- }
- spin_lock(&sbi->old_work_lock);
- /* Avoid clobbering the cancel state... */
- if (sbi->work_queued == 1)
- sbi->work_queued = 0;
- spin_unlock(&sbi->old_work_lock);
-
- reiserfs_sync_fs(s, 1);
- up_read(&s->s_umount);
-}
-
-void reiserfs_schedule_old_flush(struct super_block *s)
-{
- struct reiserfs_sb_info *sbi = REISERFS_SB(s);
- unsigned long delay;
-
- /*
- * Avoid scheduling flush when sb is being shut down. It can race
- * with journal shutdown and free still queued delayed work.
- */
- if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE))
- return;
-
- spin_lock(&sbi->old_work_lock);
- if (!sbi->work_queued) {
- delay = msecs_to_jiffies(dirty_writeback_interval * 10);
- queue_delayed_work(system_long_wq, &sbi->old_work, delay);
- sbi->work_queued = 1;
- }
- spin_unlock(&sbi->old_work_lock);
-}
-
-void reiserfs_cancel_old_flush(struct super_block *s)
-{
- struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
- spin_lock(&sbi->old_work_lock);
- /* Make sure no new flushes will be queued */
- sbi->work_queued = 2;
- spin_unlock(&sbi->old_work_lock);
- cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
-}
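
flush_old_commits(), reiserfs_schedule_old_flush() and reiserfs_cancel_old_flush() coordinate through a tri-state work_queued field: 0 means idle, 1 means a flush is queued, 2 means cancellation is in progress and nothing may requeue. A sketch of just that state machine, assuming a plain pthread mutex in place of old_work_lock and stubbing out the workqueue call:

#include <pthread.h>

enum { FLUSH_IDLE = 0, FLUSH_QUEUED = 1, FLUSH_CANCELLED = 2 };

static pthread_mutex_t old_work_lock = PTHREAD_MUTEX_INITIALIZER;
static int work_queued = FLUSH_IDLE;

static void queue_flush(void)
{
	/* stub for queue_delayed_work() */
}

/* reiserfs_schedule_old_flush(): queue only from the idle state, so a
 * pending cancel (state 2) is never overridden */
static void schedule_old_flush(void)
{
	pthread_mutex_lock(&old_work_lock);
	if (work_queued == FLUSH_IDLE) {
		queue_flush();
		work_queued = FLUSH_QUEUED;
	}
	pthread_mutex_unlock(&old_work_lock);
}

/* worker side: only a queued flush may drop back to idle; a concurrent
 * cancel keeps the state at FLUSH_CANCELLED */
static void flush_work_done(void)
{
	pthread_mutex_lock(&old_work_lock);
	if (work_queued == FLUSH_QUEUED)
		work_queued = FLUSH_IDLE;
	pthread_mutex_unlock(&old_work_lock);
}

/* reiserfs_cancel_old_flush(): forbid requeueing first; only then is it
 * safe to wait for an in-flight worker outside the lock */
static void cancel_old_flush(void)
{
	pthread_mutex_lock(&old_work_lock);
	work_queued = FLUSH_CANCELLED;
	pthread_mutex_unlock(&old_work_lock);
	/* cancel_delayed_work_sync() would run here */
}

int main(void)
{
	schedule_old_flush();
	cancel_old_flush();
	flush_work_done();	/* no-op: state stays FLUSH_CANCELLED */
	return 0;
}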
-
-static int reiserfs_freeze(struct super_block *s)
-{
- struct reiserfs_transaction_handle th;
-
- reiserfs_cancel_old_flush(s);
-
- reiserfs_write_lock(s);
- if (!sb_rdonly(s)) {
- int err = journal_begin(&th, s, 1);
- if (err) {
- reiserfs_block_writes(&th);
- } else {
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
- 1);
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
- reiserfs_block_writes(&th);
- journal_end_sync(&th);
- }
- }
- reiserfs_write_unlock(s);
- return 0;
-}
-
-static int reiserfs_unfreeze(struct super_block *s)
-{
- struct reiserfs_sb_info *sbi = REISERFS_SB(s);
-
- reiserfs_allow_writes(s);
- spin_lock(&sbi->old_work_lock);
- /* Allow old_work to run again */
- sbi->work_queued = 0;
- spin_unlock(&sbi->old_work_lock);
- return 0;
-}
-
-extern const struct in_core_key MAX_IN_CORE_KEY;
-
-/*
- * this is used to delete a "save link" when there are no items of the
- * file it points to. It can happen either if the unlink completed but
- * removal of the "save link" did not, or if the file had both an unlink
- * and a truncate pending and the unlink completed first (because the key
- * of the "save link" protecting the unlink is bigger than the key of the
- * "save link" protecting the truncate), leaving no items to complete the
- * truncate on
- */
-static int remove_save_link_only(struct super_block *s,
- struct reiserfs_key *key, int oid_free)
-{
- struct reiserfs_transaction_handle th;
- int err;
-
- /* we are going to do one balancing */
- err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
- if (err)
- return err;
-
- reiserfs_delete_solid_item(&th, NULL, key);
- if (oid_free)
- /* removals are protected by direct items */
- reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
-
- return journal_end(&th);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_on_mount(struct super_block *, int);
-#endif
-
-/*
- * Look for uncompleted unlinks and truncates and complete them
- *
- * Called with the superblock write locked. If quotas are enabled, we have
- * to release and retake the lock around quota operations, lest we call
- * dquot_quota_on_mount(), proceed to schedule_on_each_cpu() in
- * invalidate_bdev() and deadlock waiting for the per cpu worklets to
- * complete flush_async_commits() that in turn wait for the superblock
- * write lock.
- */
-static int finish_unfinished(struct super_block *s)
-{
- INITIALIZE_PATH(path);
- struct cpu_key max_cpu_key, obj_key;
- struct reiserfs_key save_link_key, last_inode_key;
- int retval = 0;
- struct item_head *ih;
- struct buffer_head *bh;
- int item_pos;
- char *item;
- int done;
- struct inode *inode;
- int truncate;
-#ifdef CONFIG_QUOTA
- int i;
- int ms_active_set;
- int quota_enabled[REISERFS_MAXQUOTAS];
-#endif
-
- /* compose key to look for "save" links */
- max_cpu_key.version = KEY_FORMAT_3_5;
- max_cpu_key.on_disk_key.k_dir_id = ~0U;
- max_cpu_key.on_disk_key.k_objectid = ~0U;
- set_cpu_key_k_offset(&max_cpu_key, ~0U);
- max_cpu_key.key_length = 3;
-
- memset(&last_inode_key, 0, sizeof(last_inode_key));
-
-#ifdef CONFIG_QUOTA
- /* Needed for iput() to work correctly and not trash data */
- if (s->s_flags & SB_ACTIVE) {
- ms_active_set = 0;
- } else {
- ms_active_set = 1;
- s->s_flags |= SB_ACTIVE;
- }
- /* Turn on quotas so that they are updated correctly */
- for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
- quota_enabled[i] = 1;
- if (REISERFS_SB(s)->s_qf_names[i]) {
- int ret;
-
- if (sb_has_quota_active(s, i)) {
- quota_enabled[i] = 0;
- continue;
- }
- reiserfs_write_unlock(s);
- ret = reiserfs_quota_on_mount(s, i);
- reiserfs_write_lock(s);
- if (ret < 0)
- reiserfs_warning(s, "reiserfs-2500",
- "cannot turn on journaled "
- "quota: error %d", ret);
- }
- }
-#endif
-
- done = 0;
- REISERFS_SB(s)->s_is_unlinked_ok = 1;
- while (!retval) {
- int depth;
- retval = search_item(s, &max_cpu_key, &path);
- if (retval != ITEM_NOT_FOUND) {
- reiserfs_error(s, "vs-2140",
- "search_by_key returned %d", retval);
- break;
- }
-
- bh = get_last_bh(&path);
- item_pos = get_item_pos(&path);
- if (item_pos != B_NR_ITEMS(bh)) {
- reiserfs_warning(s, "vs-2060",
- "wrong position found");
- break;
- }
- item_pos--;
- ih = item_head(bh, item_pos);
-
- if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
- /* there are no "save" links anymore */
- break;
-
- save_link_key = ih->ih_key;
- if (is_indirect_le_ih(ih))
- truncate = 1;
- else
- truncate = 0;
-
- /* reiserfs_iget needs k_dirid and k_objectid only */
- item = ih_item_body(bh, ih);
- obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
- obj_key.on_disk_key.k_objectid =
- le32_to_cpu(ih->ih_key.k_objectid);
- obj_key.on_disk_key.k_offset = 0;
- obj_key.on_disk_key.k_type = 0;
-
- pathrelse(&path);
-
- inode = reiserfs_iget(s, &obj_key);
- if (IS_ERR_OR_NULL(inode)) {
-			/*
-			 * the unlink almost completed, it just did not
-			 * manage to remove the "save" link and release the
-			 * objectid
-			 */
- reiserfs_warning(s, "vs-2180", "iget failed for %K",
- &obj_key);
- retval = remove_save_link_only(s, &save_link_key, 1);
- continue;
- }
-
- if (!truncate && inode->i_nlink) {
- /* file is not unlinked */
- reiserfs_warning(s, "vs-2185",
- "file %K is not unlinked",
- &obj_key);
- retval = remove_save_link_only(s, &save_link_key, 0);
- continue;
- }
- depth = reiserfs_write_unlock_nested(inode->i_sb);
- dquot_initialize(inode);
- reiserfs_write_lock_nested(inode->i_sb, depth);
-
- if (truncate && S_ISDIR(inode->i_mode)) {
- /*
- * We got a truncate request for a dir which
- * is impossible. The only imaginable way is to
- * execute unfinished truncate request then boot
- * into old kernel, remove the file and create dir
- * with the same key.
- */
- reiserfs_warning(s, "green-2101",
- "impossible truncate on a "
- "directory %k. Please report",
- INODE_PKEY(inode));
- retval = remove_save_link_only(s, &save_link_key, 0);
- truncate = 0;
- iput(inode);
- continue;
- }
-
- if (truncate) {
- REISERFS_I(inode)->i_flags |=
- i_link_saved_truncate_mask;
- /*
- * not completed truncate found. New size was
- * committed together with "save" link
- */
- reiserfs_info(s, "Truncating %k to %lld ..",
- INODE_PKEY(inode), inode->i_size);
-
- /* don't update modification time */
- reiserfs_truncate_file(inode, 0);
-
- retval = remove_save_link(inode, truncate);
- } else {
- REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
- /* not completed unlink (rmdir) found */
- reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
- if (memcmp(&last_inode_key, INODE_PKEY(inode),
- sizeof(last_inode_key))){
- last_inode_key = *INODE_PKEY(inode);
- /* removal gets completed in iput */
- retval = 0;
- } else {
- reiserfs_warning(s, "super-2189", "Dead loop "
- "in finish_unfinished "
- "detected, just remove "
- "save link\n");
- retval = remove_save_link_only(s,
- &save_link_key, 0);
- }
- }
-
- iput(inode);
- printk("done\n");
- done++;
- }
- REISERFS_SB(s)->s_is_unlinked_ok = 0;
-
-#ifdef CONFIG_QUOTA
- /* Turn quotas off */
- reiserfs_write_unlock(s);
- for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
- if (sb_dqopt(s)->files[i] && quota_enabled[i])
- dquot_quota_off(s, i);
- }
- reiserfs_write_lock(s);
- if (ms_active_set)
- /* Restore the flag back */
- s->s_flags &= ~SB_ACTIVE;
-#endif
- pathrelse(&path);
- if (done)
- reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
- "Completed\n", done);
- return retval;
-}
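
The scan loop above relies on key ordering: searching for the maximal possible key always fails past the last item, so the item immediately before the returned position is the highest-keyed save link, recognizable by its MAX_KEY_OBJECTID dir_id. The same walk over a sorted array, as a toy sketch:

#include <stdio.h>

#define SAVE_LINK_MARK 0xffffffffu	/* stand-in for MAX_KEY_OBJECTID */

struct toy_item {
	unsigned dir_id;
	unsigned objectid;
};

int main(void)
{
	/* a sorted "tree": ordinary items first, save links (carrying the
	 * maximal dir_id) sort after everything else */
	struct toy_item items[] = {
		{ 1, 2 }, { 1, 3 }, { SAVE_LINK_MARK, 7 },
		{ SAVE_LINK_MARK, 9 },
	};
	int n = sizeof(items) / sizeof(items[0]);

	/* a search for the maximal key always lands past the end; look at
	 * the item just before that position, consume it, repeat */
	while (n > 0) {
		struct toy_item *ih = &items[n - 1];

		if (ih->dir_id != SAVE_LINK_MARK)
			break;		/* no save links left */
		printf("processing save link for inode %u\n", ih->objectid);
		n--;	/* reiserfs deletes the link; we just step back */
	}
	return 0;
}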
-
-/*
- * to protect a file being unlinked from getting lost we "save"-link files
- * being unlinked. This link will be deleted in the same transaction as the
- * last item of the file. When mounting the filesystem we scan all these
- * links and remove files which almost got lost
- */
-void add_save_link(struct reiserfs_transaction_handle *th,
- struct inode *inode, int truncate)
-{
- INITIALIZE_PATH(path);
- int retval;
- struct cpu_key key;
- struct item_head ih;
- __le32 link;
-
- BUG_ON(!th->t_trans_id);
-
- /* file can only get one "save link" of each kind */
- RFALSE(truncate &&
- (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
- "saved link already exists for truncated inode %lx",
- (long)inode->i_ino);
- RFALSE(!truncate &&
- (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
- "saved link already exists for unlinked inode %lx",
- (long)inode->i_ino);
-
- /* setup key of "save" link */
- key.version = KEY_FORMAT_3_5;
- key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
- key.on_disk_key.k_objectid = inode->i_ino;
- if (!truncate) {
- /* unlink, rmdir, rename */
- set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
- set_cpu_key_k_type(&key, TYPE_DIRECT);
-
-		/* item head of "save" link */
- make_le_item_head(&ih, &key, key.version,
- 1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
- 4 /*length */ , 0xffff /*free space */ );
- } else {
- /* truncate */
- if (S_ISDIR(inode->i_mode))
- reiserfs_warning(inode->i_sb, "green-2102",
- "Adding a truncate savelink for "
- "a directory %k! Please report",
- INODE_PKEY(inode));
- set_cpu_key_k_offset(&key, 1);
- set_cpu_key_k_type(&key, TYPE_INDIRECT);
-
-		/* item head of "save" link */
- make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
- 4 /*length */ , 0 /*free space */ );
- }
- key.key_length = 3;
-
- /* look for its place in the tree */
- retval = search_item(inode->i_sb, &key, &path);
- if (retval != ITEM_NOT_FOUND) {
- if (retval != -ENOSPC)
- reiserfs_error(inode->i_sb, "vs-2100",
- "search_by_key (%K) returned %d", &key,
- retval);
- pathrelse(&path);
- return;
- }
-
- /* body of "save" link */
- link = INODE_PKEY(inode)->k_dir_id;
-
- /* put "save" link into tree, don't charge quota to anyone */
- retval =
- reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
- if (retval) {
- if (retval != -ENOSPC)
- reiserfs_error(inode->i_sb, "vs-2120",
- "insert_item returned %d", retval);
- } else {
- if (truncate)
- REISERFS_I(inode)->i_flags |=
- i_link_saved_truncate_mask;
- else
- REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
- }
-}
-
-/* this opens transaction unlike add_save_link */
-int remove_save_link(struct inode *inode, int truncate)
-{
- struct reiserfs_transaction_handle th;
- struct reiserfs_key key;
- int err;
-
- /* we are going to do one balancing only */
- err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
- if (err)
- return err;
-
- /* setup key of "save" link */
- key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
- key.k_objectid = INODE_PKEY(inode)->k_objectid;
- if (!truncate) {
- /* unlink, rmdir, rename */
- set_le_key_k_offset(KEY_FORMAT_3_5, &key,
- 1 + inode->i_sb->s_blocksize);
- set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
- } else {
- /* truncate */
- set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
- set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
- }
-
- if ((truncate &&
- (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
- (!truncate &&
- (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
- /* don't take quota bytes from anywhere */
- reiserfs_delete_solid_item(&th, NULL, &key);
- if (!truncate) {
- reiserfs_release_objectid(&th, inode->i_ino);
- REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
- } else
- REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
-
- return journal_end(&th);
-}
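
add_save_link() and remove_save_link() must agree exactly on the key convention: dir_id MAX_KEY_OBJECTID, objectid of the inode, and offset/type telling the two flavors apart (1 + blocksize with a direct item for unlink, offset 1 with an indirect item for truncate). A toy sketch of building and classifying such keys (the struct and constant are simplified stand-ins, not the on-disk key layout):

#include <stdint.h>
#include <stdio.h>

#define MAX_KEY_OBJECTID 0xffffffffu	/* stand-in for the real constant */

struct toy_key {
	uint32_t dir_id;
	uint32_t objectid;
	uint64_t offset;
	int is_direct;	/* direct item for unlink, indirect for truncate */
};

static struct toy_key make_save_link_key(uint32_t ino, int truncate,
					 uint32_t blocksize)
{
	struct toy_key k = { MAX_KEY_OBJECTID, ino, 0, 0 };

	if (!truncate) {		/* unlink/rmdir/rename flavor */
		k.offset = 1 + (uint64_t)blocksize;
		k.is_direct = 1;
	} else {			/* truncate flavor */
		k.offset = 1;
		k.is_direct = 0;
	}
	return k;
}

int main(void)
{
	struct toy_key k = make_save_link_key(42, 0, 4096);

	/* classification mirrors finish_unfinished(): an indirect item
	 * means an interrupted truncate, a direct item means unlink */
	printf("save link for inode %u protects an %s\n", k.objectid,
	       k.is_direct ? "unlink" : "interrupted truncate");
	return 0;
}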
-
-static void reiserfs_kill_sb(struct super_block *s)
-{
- if (REISERFS_SB(s)) {
- reiserfs_proc_info_done(s);
- /*
- * Force any pending inode evictions to occur now. Any
- * inodes to be removed that have extended attributes
- * associated with them need to clean them up before
- * we can release the extended attribute root dentries.
- * shrink_dcache_for_umount will BUG if we don't release
- * those before it's called so ->put_super is too late.
- */
- shrink_dcache_sb(s);
-
- dput(REISERFS_SB(s)->xattr_root);
- REISERFS_SB(s)->xattr_root = NULL;
- dput(REISERFS_SB(s)->priv_root);
- REISERFS_SB(s)->priv_root = NULL;
- }
-
- kill_block_super(s);
-}
-
-#ifdef CONFIG_QUOTA
-static int reiserfs_quota_off(struct super_block *sb, int type);
-
-static void reiserfs_quota_off_umount(struct super_block *s)
-{
- int type;
-
- for (type = 0; type < REISERFS_MAXQUOTAS; type++)
- reiserfs_quota_off(s, type);
-}
-#else
-static inline void reiserfs_quota_off_umount(struct super_block *s)
-{
-}
-#endif
-
-static void reiserfs_put_super(struct super_block *s)
-{
- struct reiserfs_transaction_handle th;
- th.t_trans_id = 0;
-
- reiserfs_quota_off_umount(s);
-
- reiserfs_write_lock(s);
-
- /*
- * change file system state to current state if it was mounted
- * with read-write permissions
- */
- if (!sb_rdonly(s)) {
- if (!journal_begin(&th, s, 10)) {
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
- 1);
- set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
- REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
- }
- }
-
- /*
- * note, journal_release checks for readonly mount, and can
- * decide not to do a journal_end
- */
- journal_release(&th, s);
-
- reiserfs_free_bitmap_cache(s);
-
- brelse(SB_BUFFER_WITH_SB(s));
-
- print_statistics(s);
-
- if (REISERFS_SB(s)->reserved_blocks != 0) {
- reiserfs_warning(s, "green-2005", "reserved blocks left %d",
- REISERFS_SB(s)->reserved_blocks);
- }
-
- reiserfs_write_unlock(s);
- mutex_destroy(&REISERFS_SB(s)->lock);
- destroy_workqueue(REISERFS_SB(s)->commit_wq);
- kfree(REISERFS_SB(s)->s_jdev);
- kfree(s->s_fs_info);
- s->s_fs_info = NULL;
-}
-
-static struct kmem_cache *reiserfs_inode_cachep;
-
-static struct inode *reiserfs_alloc_inode(struct super_block *sb)
-{
- struct reiserfs_inode_info *ei;
- ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
- if (!ei)
- return NULL;
- atomic_set(&ei->openers, 0);
- mutex_init(&ei->tailpack);
-#ifdef CONFIG_QUOTA
- memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
-#endif
-
- return &ei->vfs_inode;
-}
-
-static void reiserfs_free_inode(struct inode *inode)
-{
- kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
- struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
-
- INIT_LIST_HEAD(&ei->i_prealloc_list);
- inode_init_once(&ei->vfs_inode);
-}
-
-static int __init init_inodecache(void)
-{
- reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
- sizeof(struct
- reiserfs_inode_info),
- 0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_ACCOUNT),
- init_once);
- if (reiserfs_inode_cachep == NULL)
- return -ENOMEM;
- return 0;
-}
-
-static void destroy_inodecache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(reiserfs_inode_cachep);
-}
-
-/* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode, int flags)
-{
- struct reiserfs_transaction_handle th;
-
- int err = 0;
-
- if (sb_rdonly(inode->i_sb)) {
- reiserfs_warning(inode->i_sb, "clm-6006",
- "writing inode %lu on readonly FS",
- inode->i_ino);
- return;
- }
- reiserfs_write_lock(inode->i_sb);
-
- /*
- * this is really only used for atime updates, so they don't have
- * to be included in O_SYNC or fsync
- */
- err = journal_begin(&th, inode->i_sb, 1);
- if (err)
- goto out;
-
- reiserfs_update_sd(&th, inode);
- journal_end(&th);
-
-out:
- reiserfs_write_unlock(inode->i_sb);
-}
-
-static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
-{
- struct super_block *s = root->d_sb;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- long opts = REISERFS_SB(s)->s_mount_opt;
-
- if (opts & (1 << REISERFS_LARGETAIL))
- seq_puts(seq, ",tails=on");
- else if (!(opts & (1 << REISERFS_SMALLTAIL)))
- seq_puts(seq, ",notail");
- /* tails=small is default so we don't show it */
-
- if (!(opts & (1 << REISERFS_BARRIER_FLUSH)))
- seq_puts(seq, ",barrier=none");
- /* barrier=flush is default so we don't show it */
-
- if (opts & (1 << REISERFS_ERROR_CONTINUE))
- seq_puts(seq, ",errors=continue");
- else if (opts & (1 << REISERFS_ERROR_PANIC))
- seq_puts(seq, ",errors=panic");
- /* errors=ro is default so we don't show it */
-
- if (opts & (1 << REISERFS_DATA_LOG))
- seq_puts(seq, ",data=journal");
- else if (opts & (1 << REISERFS_DATA_WRITEBACK))
- seq_puts(seq, ",data=writeback");
- /* data=ordered is default so we don't show it */
-
- if (opts & (1 << REISERFS_ATTRS))
- seq_puts(seq, ",attrs");
-
- if (opts & (1 << REISERFS_XATTRS_USER))
- seq_puts(seq, ",user_xattr");
-
- if (opts & (1 << REISERFS_EXPOSE_PRIVROOT))
- seq_puts(seq, ",expose_privroot");
-
- if (opts & (1 << REISERFS_POSIXACL))
- seq_puts(seq, ",acl");
-
- if (REISERFS_SB(s)->s_jdev)
- seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
-
- if (journal->j_max_commit_age != journal->j_default_max_commit_age)
- seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
-
-#ifdef CONFIG_QUOTA
- if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
- seq_show_option(seq, "usrjquota",
- REISERFS_SB(s)->s_qf_names[USRQUOTA]);
- else if (opts & (1 << REISERFS_USRQUOTA))
- seq_puts(seq, ",usrquota");
- if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
- seq_show_option(seq, "grpjquota",
- REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
- else if (opts & (1 << REISERFS_GRPQUOTA))
- seq_puts(seq, ",grpquota");
- if (REISERFS_SB(s)->s_jquota_fmt) {
- if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_OLD)
- seq_puts(seq, ",jqfmt=vfsold");
- else if (REISERFS_SB(s)->s_jquota_fmt == QFMT_VFS_V0)
- seq_puts(seq, ",jqfmt=vfsv0");
- }
-#endif
-
- /* Block allocator options */
- if (opts & (1 << REISERFS_NO_BORDER))
- seq_puts(seq, ",block-allocator=noborder");
- if (opts & (1 << REISERFS_NO_UNHASHED_RELOCATION))
- seq_puts(seq, ",block-allocator=no_unhashed_relocation");
- if (opts & (1 << REISERFS_HASHED_RELOCATION))
- seq_puts(seq, ",block-allocator=hashed_relocation");
- if (opts & (1 << REISERFS_TEST4))
- seq_puts(seq, ",block-allocator=test4");
- show_alloc_options(seq, s);
- return 0;
-}
-
-#ifdef CONFIG_QUOTA
-static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
- size_t, loff_t);
-static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
- loff_t);
-
-static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
-{
- return REISERFS_I(inode)->i_dquot;
-}
-#endif
-
-static const struct super_operations reiserfs_sops = {
- .alloc_inode = reiserfs_alloc_inode,
- .free_inode = reiserfs_free_inode,
- .write_inode = reiserfs_write_inode,
- .dirty_inode = reiserfs_dirty_inode,
- .evict_inode = reiserfs_evict_inode,
- .put_super = reiserfs_put_super,
- .sync_fs = reiserfs_sync_fs,
- .freeze_fs = reiserfs_freeze,
- .unfreeze_fs = reiserfs_unfreeze,
- .statfs = reiserfs_statfs,
- .remount_fs = reiserfs_remount,
- .show_options = reiserfs_show_options,
-#ifdef CONFIG_QUOTA
- .quota_read = reiserfs_quota_read,
- .quota_write = reiserfs_quota_write,
- .get_dquots = reiserfs_get_dquots,
-#endif
-};
-
-#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-
-static int reiserfs_write_dquot(struct dquot *);
-static int reiserfs_acquire_dquot(struct dquot *);
-static int reiserfs_release_dquot(struct dquot *);
-static int reiserfs_mark_dquot_dirty(struct dquot *);
-static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, const struct path *);
-
-static const struct dquot_operations reiserfs_quota_operations = {
- .write_dquot = reiserfs_write_dquot,
- .acquire_dquot = reiserfs_acquire_dquot,
- .release_dquot = reiserfs_release_dquot,
- .mark_dirty = reiserfs_mark_dquot_dirty,
- .write_info = reiserfs_write_info,
- .alloc_dquot = dquot_alloc,
- .destroy_dquot = dquot_destroy,
- .get_next_id = dquot_get_next_id,
-};
-
-static const struct quotactl_ops reiserfs_qctl_operations = {
- .quota_on = reiserfs_quota_on,
- .quota_off = reiserfs_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_state = dquot_get_state,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk,
-};
-#endif
-
-static const struct export_operations reiserfs_export_ops = {
- .encode_fh = reiserfs_encode_fh,
- .fh_to_dentry = reiserfs_fh_to_dentry,
- .fh_to_parent = reiserfs_fh_to_parent,
- .get_parent = reiserfs_get_parent,
-};
-
-/*
- * this struct is used in reiserfs_getopt() to hold the value for those
- * mount options that take values rather than being toggles.
- */
-typedef struct {
-	char *value;
-	/*
-	 * bitmask to set in the mount_options bitmask when this value is
-	 * found; 0 if no bits are to be changed.
-	 */
-	int setmask;
-	/*
-	 * bitmask to clear in the mount_options bitmask when this value is
-	 * found; 0 if no bits are to be changed. This is applied BEFORE
-	 * setmask (see the sketch after this struct)
-	 */
-	int clrmask;
-} arg_desc_t;
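
The clear-before-set order matters: it lets one matched value atomically switch a group of mutually exclusive bits, which the logging_mode and barrier_mode tables below rely on. The application order in isolation, as a tiny sketch with made-up bit numbers:

#include <stdio.h>

/* apply one matched entry: clrmask strictly before setmask,
 * as the comments above require */
static void apply_masks(unsigned long *bits, int setmask, int clrmask)
{
	*bits &= ~(unsigned long)clrmask;
	*bits |= (unsigned long)setmask;
}

int main(void)
{
	unsigned long opts = 1 << 2;	/* say, data=ordered */

	/* switch to data=journal: clear ordered+writeback, set log */
	apply_masks(&opts, 1 << 1, (1 << 2) | (1 << 3));
	printf("%#lx\n", opts);		/* prints 0x2 */
	return 0;
}

With the logging_mode table, for example, selecting "journal" clears the ordered and writeback bits before setting the log bit, leaving exactly one data-mode bit set.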
-
-/* Set this bit in arg_required to allow empty arguments */
-#define REISERFS_OPT_ALLOWEMPTY 31
-
-/*
- * this struct is used in reiserfs_getopt() for describing the
- * set of reiserfs mount options
- */
-typedef struct {
- char *option_name;
-
-	/* 0 if an argument is not required, non-zero otherwise */
- int arg_required;
-
- /* list of values accepted by an option */
- const arg_desc_t *values;
-
-	/*
-	 * bitmask to set in the mount_options bitmask when this option is
-	 * found; 0 if no bits are to be changed.
-	 */
-	int setmask;
-
-	/*
-	 * bitmask to clear in the mount_options bitmask when this option is
-	 * found; 0 if no bits are to be changed. This is applied BEFORE
-	 * setmask
-	 */
-	int clrmask;
-} opt_desc_t;
-
-/* possible values for -o data= */
-static const arg_desc_t logging_mode[] = {
- {"ordered", 1 << REISERFS_DATA_ORDERED,
- (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
- {"journal", 1 << REISERFS_DATA_LOG,
- (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
- {"writeback", 1 << REISERFS_DATA_WRITEBACK,
- (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
- {.value = NULL}
-};
-
-/* possible values for -o barrier= */
-static const arg_desc_t barrier_mode[] = {
- {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
- {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
- {.value = NULL}
-};
-
-/*
- * possible values for "-o block-allocator=" and the bits which are to be
- * set in s_mount_opt in the reiserfs-specific part of the in-core super
- * block
- */
-static const arg_desc_t balloc[] = {
- {"noborder", 1 << REISERFS_NO_BORDER, 0},
- {"border", 0, 1 << REISERFS_NO_BORDER},
- {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
- {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
- {"test4", 1 << REISERFS_TEST4, 0},
- {"notest4", 0, 1 << REISERFS_TEST4},
- {NULL, 0, 0}
-};
-
-static const arg_desc_t tails[] = {
- {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
- {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
- {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
- {NULL, 0, 0}
-};
-
-static const arg_desc_t error_actions[] = {
- {"panic", 1 << REISERFS_ERROR_PANIC,
- (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
- {"ro-remount", 1 << REISERFS_ERROR_RO,
- (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
-#ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
- {"continue", 1 << REISERFS_ERROR_CONTINUE,
- (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
-#endif
- {NULL, 0, 0},
-};
-
-/*
- * process a single option from the list
- * *cur - string containing the mount options
- * opts - array of options which are accepted
- * opt_arg - if the option is found and requires an argument, and the
- * argument is specified in the input, a pointer to the argument is
- * stored here
- * bit_flags - if the option requires a certain bit to be set, it is set here
- * returns -1 if an unknown option is found, opt->arg_required otherwise
- */
-static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
- char **opt_arg, unsigned long *bit_flags)
-{
- char *p;
- /*
- * foo=bar,
- * ^ ^ ^
- * | | +-- option_end
- * | +-- arg_start
- * +-- option_start
- */
- const opt_desc_t *opt;
- const arg_desc_t *arg;
-
- p = *cur;
-
- /* assume argument cannot contain commas */
- *cur = strchr(p, ',');
- if (*cur) {
- *(*cur) = '\0';
- (*cur)++;
- }
-
- if (!strncmp(p, "alloc=", 6)) {
-		/*
-		 * Ugly special case; probably we should redo the options
-		 * parser so that it can understand several arguments for
-		 * some options, and so that it can fill several bitfields
-		 * with option values.
-		 */
- if (reiserfs_parse_alloc_options(s, p + 6)) {
- return -1;
- } else {
- return 0;
- }
- }
-
- /* for every option in the list */
- for (opt = opts; opt->option_name; opt++) {
- if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
- if (bit_flags) {
- if (opt->clrmask ==
- (1 << REISERFS_UNSUPPORTED_OPT))
- reiserfs_warning(s, "super-6500",
- "%s not supported.\n",
- p);
- else
- *bit_flags &= ~opt->clrmask;
- if (opt->setmask ==
- (1 << REISERFS_UNSUPPORTED_OPT))
- reiserfs_warning(s, "super-6501",
- "%s not supported.\n",
- p);
- else
- *bit_flags |= opt->setmask;
- }
- break;
- }
- }
- if (!opt->option_name) {
- reiserfs_warning(s, "super-6502",
- "unknown mount option \"%s\"", p);
- return -1;
- }
-
- p += strlen(opt->option_name);
- switch (*p) {
- case '=':
- if (!opt->arg_required) {
- reiserfs_warning(s, "super-6503",
- "the option \"%s\" does not "
- "require an argument\n",
- opt->option_name);
- return -1;
- }
- break;
-
- case 0:
- if (opt->arg_required) {
- reiserfs_warning(s, "super-6504",
- "the option \"%s\" requires an "
- "argument\n", opt->option_name);
- return -1;
- }
- break;
- default:
- reiserfs_warning(s, "super-6505",
-				 "only the head of option \"%s\" is correct\n",
- opt->option_name);
- return -1;
- }
-
- /*
- * move to the argument, or to next option if argument is not
- * required
- */
- p++;
-
- if (opt->arg_required
- && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
- && !strlen(p)) {
- /* this catches "option=," if not allowed */
- reiserfs_warning(s, "super-6506",
- "empty argument for \"%s\"\n",
- opt->option_name);
- return -1;
- }
-
- if (!opt->values) {
-		/* opt->values == NULL; *opt_arg gets a pointer to the argument */
- *opt_arg = p;
- return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
- }
-
- /* values possible for this option are listed in opt->values */
- for (arg = opt->values; arg->value; arg++) {
- if (!strcmp(p, arg->value)) {
- if (bit_flags) {
- *bit_flags &= ~arg->clrmask;
- *bit_flags |= arg->setmask;
- }
- return opt->arg_required;
- }
- }
-
- reiserfs_warning(s, "super-6506",
- "bad value \"%s\" for option \"%s\"\n", p,
- opt->option_name);
- return -1;
-}
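
The tokenizing core of reiserfs_getopt() is easy to miss among the validation: it NUL-terminates the current comma-separated token in place and advances *cur to the next one. Extracted into a standalone sketch (hypothetical helper names, operating on a writable copy of the option string):

#include <stdio.h>
#include <string.h>

/* return the token at *cur and advance *cur past the next comma,
 * mutating the string in place exactly as reiserfs_getopt() does */
static char *next_opt(char **cur)
{
	char *p = *cur;

	*cur = strchr(p, ',');
	if (*cur) {
		**cur = '\0';
		(*cur)++;
	}
	return p;
}

int main(void)
{
	char opts[] = "notail,data=ordered,commit=30";
	char *pos = opts;

	while (pos) {
		char *tok = next_opt(&pos);
		char *arg = strchr(tok, '=');

		if (arg)
			*arg++ = '\0';	/* split "name=value" */
		printf("option %-8s arg %s\n", tok, arg ? arg : "(none)");
	}
	return 0;
}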
-
-/* returns 0 if something is wrong in the option string, 1 otherwise */
-static int reiserfs_parse_options(struct super_block *s,
-
- /* string given via mount's -o */
- char *options,
-
- /*
- * after the parsing phase, contains the
- * collection of bitflags defining what
- * mount options were selected.
- */
- unsigned long *mount_options,
-
- /* strtol-ed from NNN of resize=NNN */
- unsigned long *blocks,
- char **jdev_name,
- unsigned int *commit_max_age,
- char **qf_names,
- unsigned int *qfmt)
-{
- int c;
- char *arg = NULL;
- char *pos;
- opt_desc_t opts[] = {
-		/*
-		 * Compatibility stuff, so that -o notail for old
-		 * setups still works
-		 */
- {"tails",.arg_required = 't',.values = tails},
- {"notail",.clrmask =
- (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
- {"conv",.setmask = 1 << REISERFS_CONVERT},
- {"attrs",.setmask = 1 << REISERFS_ATTRS},
- {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
- {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
-#ifdef CONFIG_REISERFS_FS_XATTR
- {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
- {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
-#else
- {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
- {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
-#ifdef CONFIG_REISERFS_FS_POSIX_ACL
- {"acl",.setmask = 1 << REISERFS_POSIXACL},
- {"noacl",.clrmask = 1 << REISERFS_POSIXACL},
-#else
- {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
- {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
-#endif
- {.option_name = "nolog"},
- {"replayonly",.setmask = 1 << REPLAYONLY},
- {"block-allocator",.arg_required = 'a',.values = balloc},
- {"data",.arg_required = 'd',.values = logging_mode},
- {"barrier",.arg_required = 'b',.values = barrier_mode},
- {"resize",.arg_required = 'r',.values = NULL},
- {"jdev",.arg_required = 'j',.values = NULL},
- {"nolargeio",.arg_required = 'w',.values = NULL},
- {"commit",.arg_required = 'c',.values = NULL},
- {"usrquota",.setmask = 1 << REISERFS_USRQUOTA},
- {"grpquota",.setmask = 1 << REISERFS_GRPQUOTA},
- {"noquota",.clrmask = 1 << REISERFS_USRQUOTA | 1 << REISERFS_GRPQUOTA},
- {"errors",.arg_required = 'e',.values = error_actions},
- {"usrjquota",.arg_required =
- 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
- {"grpjquota",.arg_required =
- 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
- {"jqfmt",.arg_required = 'f',.values = NULL},
- {.option_name = NULL}
- };
-
- *blocks = 0;
- if (!options || !*options)
- /*
- * use default configuration: create tails, journaling on, no
- * conversion to newest format
- */
- return 1;
-
- for (pos = options; pos;) {
- c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
- if (c == -1)
- /* wrong option is given */
- return 0;
-
- if (c == 'r') {
- char *p;
-
- p = NULL;
- /* "resize=NNN" or "resize=auto" */
-
- if (!strcmp(arg, "auto")) {
- /* From JFS code, to auto-get the size. */
- *blocks = sb_bdev_nr_blocks(s);
- } else {
- *blocks = simple_strtoul(arg, &p, 0);
- if (*p != '\0') {
- /* NNN does not look like a number */
- reiserfs_warning(s, "super-6507",
- "bad value %s for "
- "-oresize\n", arg);
- return 0;
- }
- }
- }
-
- if (c == 'c') {
- char *p = NULL;
- unsigned long val = simple_strtoul(arg, &p, 0);
- /* commit=NNN (time in seconds) */
- if (*p != '\0' || val >= (unsigned int)-1) {
- reiserfs_warning(s, "super-6508",
- "bad value %s for -ocommit\n",
- arg);
- return 0;
- }
- *commit_max_age = (unsigned int)val;
- }
-
- if (c == 'w') {
- reiserfs_warning(s, "super-6509", "nolargeio option "
- "is no longer supported");
- return 0;
- }
-
- if (c == 'j') {
- if (arg && *arg && jdev_name) {
- /* Hm, already assigned? */
- if (*jdev_name) {
- reiserfs_warning(s, "super-6510",
- "journal device was "
- "already specified to "
- "be %s", *jdev_name);
- return 0;
- }
- *jdev_name = arg;
- }
- }
-#ifdef CONFIG_QUOTA
- if (c == 'u' || c == 'g') {
- int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
-
- if (sb_any_quota_loaded(s) &&
- (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
- reiserfs_warning(s, "super-6511",
- "cannot change journaled "
- "quota options when quota "
- "turned on.");
- return 0;
- }
- if (qf_names[qtype] !=
- REISERFS_SB(s)->s_qf_names[qtype])
- kfree(qf_names[qtype]);
- qf_names[qtype] = NULL;
- if (*arg) { /* Some filename specified? */
- if (REISERFS_SB(s)->s_qf_names[qtype]
- && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
- arg)) {
- reiserfs_warning(s, "super-6512",
- "%s quota file "
- "already specified.",
- QTYPE2NAME(qtype));
- return 0;
- }
- if (strchr(arg, '/')) {
- reiserfs_warning(s, "super-6513",
- "quotafile must be "
- "on filesystem root.");
- return 0;
- }
- qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
- if (!qf_names[qtype]) {
- reiserfs_warning(s, "reiserfs-2502",
- "not enough memory "
- "for storing "
- "quotafile name.");
- return 0;
- }
- if (qtype == USRQUOTA)
- *mount_options |= 1 << REISERFS_USRQUOTA;
- else
- *mount_options |= 1 << REISERFS_GRPQUOTA;
- } else {
- if (qtype == USRQUOTA)
- *mount_options &= ~(1 << REISERFS_USRQUOTA);
- else
- *mount_options &= ~(1 << REISERFS_GRPQUOTA);
- }
- }
- if (c == 'f') {
- if (!strcmp(arg, "vfsold"))
- *qfmt = QFMT_VFS_OLD;
- else if (!strcmp(arg, "vfsv0"))
- *qfmt = QFMT_VFS_V0;
- else {
- reiserfs_warning(s, "super-6514",
- "unknown quota format "
- "specified.");
- return 0;
- }
- if (sb_any_quota_loaded(s) &&
- *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
- reiserfs_warning(s, "super-6515",
- "cannot change journaled "
- "quota options when quota "
- "turned on.");
- return 0;
- }
- }
-#else
- if (c == 'u' || c == 'g' || c == 'f') {
- reiserfs_warning(s, "reiserfs-2503", "journaled "
- "quota options not supported.");
- return 0;
- }
-#endif
- }
-
-#ifdef CONFIG_QUOTA
- if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
- && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
- reiserfs_warning(s, "super-6515",
- "journaled quota format not specified.");
- return 0;
- }
- if ((!(*mount_options & (1 << REISERFS_USRQUOTA)) &&
- sb_has_quota_loaded(s, USRQUOTA)) ||
- (!(*mount_options & (1 << REISERFS_GRPQUOTA)) &&
- sb_has_quota_loaded(s, GRPQUOTA))) {
- reiserfs_warning(s, "super-6516", "quota options must "
- "be present when quota is turned on.");
- return 0;
- }
-#endif
-
- return 1;
-}
-
-static void switch_data_mode(struct super_block *s, unsigned long mode)
-{
- REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
- (1 << REISERFS_DATA_ORDERED) |
- (1 << REISERFS_DATA_WRITEBACK));
- REISERFS_SB(s)->s_mount_opt |= (1 << mode);
-}
-
-static void handle_data_mode(struct super_block *s, unsigned long mount_options)
-{
- if (mount_options & (1 << REISERFS_DATA_LOG)) {
- if (!reiserfs_data_log(s)) {
- switch_data_mode(s, REISERFS_DATA_LOG);
- reiserfs_info(s, "switching to journaled data mode\n");
- }
- } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
- if (!reiserfs_data_ordered(s)) {
- switch_data_mode(s, REISERFS_DATA_ORDERED);
- reiserfs_info(s, "switching to ordered data mode\n");
- }
- } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
- if (!reiserfs_data_writeback(s)) {
- switch_data_mode(s, REISERFS_DATA_WRITEBACK);
- reiserfs_info(s, "switching to writeback data mode\n");
- }
- }
-}
-
-static void handle_barrier_mode(struct super_block *s, unsigned long bits)
-{
- int flush = (1 << REISERFS_BARRIER_FLUSH);
- int none = (1 << REISERFS_BARRIER_NONE);
- int all_barrier = flush | none;
-
- if (bits & all_barrier) {
- REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
- if (bits & flush) {
- REISERFS_SB(s)->s_mount_opt |= flush;
- printk("reiserfs: enabling write barrier flush mode\n");
- } else if (bits & none) {
- REISERFS_SB(s)->s_mount_opt |= none;
- printk("reiserfs: write barriers turned off\n");
- }
- }
-}
-
-static void handle_attrs(struct super_block *s)
-{
- struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-
- if (reiserfs_attrs(s)) {
- if (old_format_only(s)) {
- reiserfs_warning(s, "super-6517", "cannot support "
- "attributes on 3.5.x disk format");
- REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
- return;
- }
- if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
- reiserfs_warning(s, "super-6518", "cannot support "
- "attributes until flag is set in "
- "super-block");
- REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
- }
- }
-}
-
-#ifdef CONFIG_QUOTA
-static void handle_quota_files(struct super_block *s, char **qf_names,
- unsigned int *qfmt)
-{
- int i;
-
- for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
- if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
- kfree(REISERFS_SB(s)->s_qf_names[i]);
- REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
- }
- if (*qfmt)
- REISERFS_SB(s)->s_jquota_fmt = *qfmt;
-}
-#endif
-
-static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
-{
- struct reiserfs_super_block *rs;
- struct reiserfs_transaction_handle th;
- unsigned long blocks;
- unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
- unsigned long safe_mask = 0;
- unsigned int commit_max_age = (unsigned int)-1;
- struct reiserfs_journal *journal = SB_JOURNAL(s);
- int err;
- char *qf_names[REISERFS_MAXQUOTAS];
- unsigned int qfmt = 0;
-#ifdef CONFIG_QUOTA
- int i;
-#endif
-
- sync_filesystem(s);
- reiserfs_write_lock(s);
-
-#ifdef CONFIG_QUOTA
- memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
-#endif
-
- rs = SB_DISK_SUPER_BLOCK(s);
-
- if (!reiserfs_parse_options
- (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
- qf_names, &qfmt)) {
-#ifdef CONFIG_QUOTA
- for (i = 0; i < REISERFS_MAXQUOTAS; i++)
- if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
- kfree(qf_names[i]);
-#endif
- err = -EINVAL;
- goto out_err_unlock;
- }
-#ifdef CONFIG_QUOTA
- handle_quota_files(s, qf_names, &qfmt);
-#endif
-
- handle_attrs(s);
-
- /* Add options that are safe here */
- safe_mask |= 1 << REISERFS_SMALLTAIL;
- safe_mask |= 1 << REISERFS_LARGETAIL;
- safe_mask |= 1 << REISERFS_NO_BORDER;
- safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
- safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
- safe_mask |= 1 << REISERFS_TEST4;
- safe_mask |= 1 << REISERFS_ATTRS;
- safe_mask |= 1 << REISERFS_XATTRS_USER;
- safe_mask |= 1 << REISERFS_POSIXACL;
- safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
- safe_mask |= 1 << REISERFS_BARRIER_NONE;
- safe_mask |= 1 << REISERFS_ERROR_RO;
- safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
- safe_mask |= 1 << REISERFS_ERROR_PANIC;
- safe_mask |= 1 << REISERFS_USRQUOTA;
- safe_mask |= 1 << REISERFS_GRPQUOTA;
-
- /*
- * Update the bitmask, taking care to keep
- * the bits we're not allowed to change here
- */
- REISERFS_SB(s)->s_mount_opt =
- (REISERFS_SB(s)->
- s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
-
- if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
- journal->j_max_commit_age = commit_max_age;
- journal->j_max_trans_age = commit_max_age;
- } else if (commit_max_age == 0) {
- /* 0 means restore defaults. */
- journal->j_max_commit_age = journal->j_default_max_commit_age;
- journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
- }
-
- if (blocks) {
- err = reiserfs_resize(s, blocks);
- if (err != 0)
- goto out_err_unlock;
- }
-
- if (*mount_flags & SB_RDONLY) {
- reiserfs_write_unlock(s);
- reiserfs_xattr_init(s, *mount_flags);
- /* remount read-only */
- if (sb_rdonly(s))
- /* it is read-only already */
- goto out_ok_unlocked;
-
- err = dquot_suspend(s, -1);
- if (err < 0)
- goto out_err;
-
- /* try to remount file system with read-only permissions */
- if (sb_umount_state(rs) == REISERFS_VALID_FS
- || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
- goto out_ok_unlocked;
- }
-
- reiserfs_write_lock(s);
-
- err = journal_begin(&th, s, 10);
- if (err)
- goto out_err_unlock;
-
- /* Mounting a rw partition read-only. */
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
- } else {
- /* remount read-write */
- if (!sb_rdonly(s)) {
- reiserfs_write_unlock(s);
- reiserfs_xattr_init(s, *mount_flags);
- goto out_ok_unlocked; /* We are read-write already */
- }
-
- if (reiserfs_is_journal_aborted(journal)) {
- err = journal->j_errno;
- goto out_err_unlock;
- }
-
- handle_data_mode(s, mount_options);
- handle_barrier_mode(s, mount_options);
- REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-
- /* now it is safe to call journal_begin */
- s->s_flags &= ~SB_RDONLY;
- err = journal_begin(&th, s, 10);
- if (err)
- goto out_err_unlock;
-
- /* Mount a partition which is read-only, read-write */
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
- s->s_flags &= ~SB_RDONLY;
- set_sb_umount_state(rs, REISERFS_ERROR_FS);
- if (!old_format_only(s))
- set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
- /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
- REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
- }
- /* this will force a full flush of all journal lists */
- SB_JOURNAL(s)->j_must_wait = 1;
- err = journal_end(&th);
- if (err)
- goto out_err_unlock;
-
- reiserfs_write_unlock(s);
- if (!(*mount_flags & SB_RDONLY)) {
- dquot_resume(s, -1);
- reiserfs_write_lock(s);
- finish_unfinished(s);
- reiserfs_write_unlock(s);
- reiserfs_xattr_init(s, *mount_flags);
- }
-
-out_ok_unlocked:
- return 0;
-
-out_err_unlock:
- reiserfs_write_unlock(s);
-out_err:
- return err;
-}
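
The actual option update in reiserfs_remount() is a single masked merge: bits outside safe_mask survive from the current state, bits inside it are taken from the freshly parsed options. That merge in isolation, with made-up bit numbers:

#include <stdio.h>

/* keep un-remountable bits, take remount-safe bits from the new set */
static unsigned long merge_opts(unsigned long old, unsigned long wanted,
				unsigned long safe_mask)
{
	return (old & ~safe_mask) | (wanted & safe_mask);
}

int main(void)
{
	unsigned long safe = (1 << 0) | (1 << 1);	/* e.g. tail bits */
	unsigned long old = (1 << 1) | (1 << 5);	/* bit 5 not safe */
	unsigned long wanted = 1 << 0;

	/* bit 5 is preserved even though "wanted" dropped it */
	printf("%#lx\n", merge_opts(old, wanted, safe));	/* 0x21 */
	return 0;
}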
-
-static int read_super_block(struct super_block *s, int offset)
-{
- struct buffer_head *bh;
- struct reiserfs_super_block *rs;
- int fs_blocksize;
-
- bh = sb_bread(s, offset / s->s_blocksize);
- if (!bh) {
- reiserfs_warning(s, "sh-2006",
- "bread failed (dev %s, block %lu, size %lu)",
- s->s_id, offset / s->s_blocksize,
- s->s_blocksize);
- return 1;
- }
-
- rs = (struct reiserfs_super_block *)bh->b_data;
- if (!is_any_reiserfs_magic_string(rs)) {
- brelse(bh);
- return 1;
- }
-	/*
-	 * ok, a reiserfs signature (old or new) was found at the given offset
-	 */
- fs_blocksize = sb_blocksize(rs);
- brelse(bh);
- sb_set_blocksize(s, fs_blocksize);
-
- bh = sb_bread(s, offset / s->s_blocksize);
- if (!bh) {
- reiserfs_warning(s, "sh-2007",
- "bread failed (dev %s, block %lu, size %lu)",
- s->s_id, offset / s->s_blocksize,
- s->s_blocksize);
- return 1;
- }
-
- rs = (struct reiserfs_super_block *)bh->b_data;
- if (sb_blocksize(rs) != s->s_blocksize) {
- reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
- "filesystem on (dev %s, block %llu, size %lu)",
- s->s_id,
- (unsigned long long)bh->b_blocknr,
- s->s_blocksize);
- brelse(bh);
- return 1;
- }
-
- if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
- brelse(bh);
-		reiserfs_warning(s, "super-6519", "Unfinished reiserfsck "
-				 "--rebuild-tree run detected. Please run\n"
-				 "reiserfsck --rebuild-tree and wait for "
-				 "completion. If that fails\n"
-				 "get a newer reiserfsprogs package");
- return 1;
- }
-
- reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
- "scheduled to be removed from the kernel in 2025");
- SB_BUFFER_WITH_SB(s) = bh;
- SB_DISK_SUPER_BLOCK(s) = rs;
-
-	/*
-	 * the magic is that of a non-standard-journal filesystem; look at
-	 * s_version to find which format is in use
-	 */
- if (is_reiserfs_jr(rs)) {
- if (sb_version(rs) == REISERFS_VERSION_2)
- reiserfs_info(s, "found reiserfs format \"3.6\""
- " with non-standard journal\n");
- else if (sb_version(rs) == REISERFS_VERSION_1)
- reiserfs_info(s, "found reiserfs format \"3.5\""
- " with non-standard journal\n");
- else {
- reiserfs_warning(s, "sh-2012", "found unknown "
- "format \"%u\" of reiserfs with "
- "non-standard magic", sb_version(rs));
- return 1;
- }
- } else
- /*
- * s_version of standard format may contain incorrect
- * information, so we just look at the magic string
- */
- reiserfs_info(s,
- "found reiserfs format \"%s\" with standard journal\n",
- is_reiserfs_3_5(rs) ? "3.5" : "3.6");
-
- s->s_op = &reiserfs_sops;
- s->s_export_op = &reiserfs_export_ops;
-#ifdef CONFIG_QUOTA
- s->s_qcop = &reiserfs_qctl_operations;
- s->dq_op = &reiserfs_quota_operations;
- s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
-#endif
-
-	/*
-	 * the new format is limited by the 32 bit wide i_blocks field; we
-	 * want to be one full block below that.
-	 */
- s->s_maxbytes = (512LL << 32) - s->s_blocksize;
- return 0;
-}
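
read_super_block() deliberately reads the superblock twice: the first read, at whatever block size is currently set, only serves to learn the filesystem's real block size; the second re-reads the same byte offset at that size so the buffer boundaries line up. A file-backed userspace sketch of the same two-pass probe (the mini-superblock layout and "ToyFs" magic are invented for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* invented mini-superblock, for illustration only */
struct toy_sb {
	char magic[8];
	uint16_t blocksize;
};

/* read the whole block containing byte `offset` at the current block
 * size, the analogue of sb_bread(s, offset / s->s_blocksize) */
static int read_block(FILE *dev, long offset, long blocksize, void *buf)
{
	if (fseek(dev, (offset / blocksize) * blocksize, SEEK_SET))
		return -1;
	return fread(buf, 1, blocksize, dev) == (size_t)blocksize ? 0 : -1;
}

int main(int argc, char **argv)
{
	static char buf[65536];
	struct toy_sb sb;
	long blocksize = 1024;		/* probe at a default size first */
	long offset = 64 * 1024;	/* the "new format" location */
	FILE *dev;

	if (argc < 2 || !(dev = fopen(argv[1], "rb")))
		return 1;

	/* pass 1: find the signature and learn the real block size */
	if (read_block(dev, offset, blocksize, buf))
		return 1;
	memcpy(&sb, buf + offset % blocksize, sizeof(sb));
	if (strncmp(sb.magic, "ToyFs", 5))
		return 1;		/* no signature at this offset */
	blocksize = sb.blocksize;
	if (blocksize < 512 || (size_t)blocksize > sizeof(buf))
		return 1;		/* implausible block size */

	/* pass 2: re-read the same byte offset at the fs block size */
	if (read_block(dev, offset, blocksize, buf))
		return 1;
	printf("superblock at %ld, blocksize %ld\n", offset, blocksize);
	fclose(dev);
	return 0;
}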
-
-/* after journal replay, reread all bitmap and super blocks */
-static int reread_meta_blocks(struct super_block *s)
-{
- if (bh_read(SB_BUFFER_WITH_SB(s), 0) < 0) {
- reiserfs_warning(s, "reiserfs-2504", "error reading the super");
- return 1;
- }
-
- return 0;
-}
-
-/* hash detection stuff */
-
-/*
- * if the root directory is empty - we set the default - Yura's - hash and
- * warn about it
- * FIXME: we look at only one name in a directory. If the tea and yura
- * hashes both give the same value for it - we ask the user to send a
- * report to the mailing list
- */
-static __u32 find_hash_out(struct super_block *s)
-{
- int retval;
- struct inode *inode;
- struct cpu_key key;
- INITIALIZE_PATH(path);
- struct reiserfs_dir_entry de;
- struct reiserfs_de_head *deh;
- __u32 hash = DEFAULT_HASH;
- __u32 deh_hashval, teahash, r5hash, yurahash;
-
- inode = d_inode(s->s_root);
-
- make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
- retval = search_by_entry_key(s, &key, &path, &de);
- if (retval == IO_ERROR) {
- pathrelse(&path);
- return UNSET_HASH;
- }
- if (retval == NAME_NOT_FOUND)
- de.de_entry_num--;
-
- set_de_name_and_namelen(&de);
- deh = de.de_deh + de.de_entry_num;
-
- if (deh_offset(deh) == DOT_DOT_OFFSET) {
- /* allow override in this case */
- if (reiserfs_rupasov_hash(s))
- hash = YURA_HASH;
- reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
- goto out;
- }
-
- deh_hashval = GET_HASH_VALUE(deh_offset(deh));
- r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
- teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
- yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
-
- if ((teahash == r5hash && deh_hashval == r5hash) ||
- (teahash == yurahash && deh_hashval == yurahash) ||
- (r5hash == yurahash && deh_hashval == yurahash)) {
- reiserfs_warning(s, "reiserfs-2506",
- "Unable to automatically detect hash "
- "function. Please mount with -o "
- "hash={tea,rupasov,r5}");
- hash = UNSET_HASH;
- goto out;
- }
-
- if (deh_hashval == yurahash)
- hash = YURA_HASH;
- else if (deh_hashval == teahash)
- hash = TEA_HASH;
- else if (deh_hashval == r5hash)
- hash = R5_HASH;
- else {
- reiserfs_warning(s, "reiserfs-2506",
- "Unrecognised hash function");
- hash = UNSET_HASH;
- }
-out:
- pathrelse(&path);
- return hash;
-}
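
find_hash_out() is a voting scheme: recompute the hash of one stored name with every candidate function, compare each against the hash bits kept in the directory entry's offset, fail on ties, and otherwise pick the unique match. The voting logic separated out, with toy hash functions standing in for the real tea/rupasov/r5 code:

#include <stdio.h>

enum { UNSET_HASH, TEA_HASH, YURA_HASH, R5_HASH };

/* toy stand-ins for keyed_hash(), yura_hash() and r5_hash() */
static unsigned toy_tea(const char *s)
{
	unsigned h = 9;

	while (*s)
		h = h * 33 ^ (unsigned char)*s++;
	return h;
}

static unsigned toy_yura(const char *s)
{
	unsigned h = 0;

	while (*s)
		h += (unsigned char)*s++;
	return h;
}

static unsigned toy_r5(const char *s)
{
	unsigned h = 0;

	while (*s)
		h = h * 5 + (unsigned char)*s++;
	return h;
}

/* mirror find_hash_out(): refuse to guess on any ambiguity, otherwise
 * return whichever candidate reproduces the stored hash value */
static int detect_hash(const char *name, unsigned stored)
{
	unsigned tea = toy_tea(name);
	unsigned yura = toy_yura(name);
	unsigned r5 = toy_r5(name);

	if ((tea == r5 && stored == r5) ||
	    (tea == yura && stored == yura) ||
	    (r5 == yura && stored == yura))
		return UNSET_HASH;	/* ambiguous, like reiserfs-2506 */
	if (stored == yura)
		return YURA_HASH;
	if (stored == tea)
		return TEA_HASH;
	if (stored == r5)
		return R5_HASH;
	return UNSET_HASH;		/* unrecognized */
}

int main(void)
{
	const char *name = "lost+found";

	printf("detected code: %d\n", detect_hash(name, toy_r5(name)));
	return 0;
}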
-
-/* finds out which hash the names are sorted with */
-static int what_hash(struct super_block *s)
-{
- __u32 code;
-
- code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
-
- /*
- * reiserfs_hash_detect() == true if any of the hash mount options
- * were used. We must check them to make sure the user isn't
- * using a bad hash value
- */
- if (code == UNSET_HASH || reiserfs_hash_detect(s))
- code = find_hash_out(s);
-
- if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
- /*
- * detection has found the hash, and we must check against the
- * mount options
- */
- if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
- reiserfs_warning(s, "reiserfs-2507",
- "Error, %s hash detected, "
- "unable to force rupasov hash",
- reiserfs_hashname(code));
- code = UNSET_HASH;
- } else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
- reiserfs_warning(s, "reiserfs-2508",
- "Error, %s hash detected, "
- "unable to force tea hash",
- reiserfs_hashname(code));
- code = UNSET_HASH;
- } else if (reiserfs_r5_hash(s) && code != R5_HASH) {
- reiserfs_warning(s, "reiserfs-2509",
- "Error, %s hash detected, "
- "unable to force r5 hash",
- reiserfs_hashname(code));
- code = UNSET_HASH;
- }
- } else {
- /*
- * find_hash_out was not called or
- * could not determine the hash
- */
- if (reiserfs_rupasov_hash(s)) {
- code = YURA_HASH;
- } else if (reiserfs_tea_hash(s)) {
- code = TEA_HASH;
- } else if (reiserfs_r5_hash(s)) {
- code = R5_HASH;
- }
- }
-
- /*
- * if we are mounted RW, and we have a new valid hash code, update
- * the super
- */
- if (code != UNSET_HASH &&
- !sb_rdonly(s) &&
- code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
- set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
- }
- return code;
-}
-
-/* return pointer to appropriate function */
-static hashf_t hash_function(struct super_block *s)
-{
- switch (what_hash(s)) {
- case TEA_HASH:
- reiserfs_info(s, "Using tea hash to sort names\n");
- return keyed_hash;
- case YURA_HASH:
- reiserfs_info(s, "Using rupasov hash to sort names\n");
- return yura_hash;
- case R5_HASH:
- reiserfs_info(s, "Using r5 hash to sort names\n");
- return r5_hash;
- }
- return NULL;
-}
-
-/* this is used to set up the correct hash code for old partitions */
-static int function2code(hashf_t func)
-{
- if (func == keyed_hash)
- return TEA_HASH;
- if (func == yura_hash)
- return YURA_HASH;
- if (func == r5_hash)
- return R5_HASH;
-
- BUG(); /* should never happen */
-
- return 0;
-}
-
-#define SWARN(silent, s, id, ...) \
- if (!(silent)) \
- reiserfs_warning(s, id, __VA_ARGS__)
-
-static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
-{
- struct inode *root_inode;
- struct reiserfs_transaction_handle th;
- int old_format = 0;
- unsigned long blocks;
- unsigned int commit_max_age = 0;
- int jinit_done = 0;
- struct reiserfs_iget_args args;
- struct reiserfs_super_block *rs;
- char *jdev_name;
- struct reiserfs_sb_info *sbi;
- int errval = -EINVAL;
- char *qf_names[REISERFS_MAXQUOTAS] = {};
- unsigned int qfmt = 0;
-
- sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
- s->s_fs_info = sbi;
- /* Set default values for options: non-aggressive tails, RO on errors */
- sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
- sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
- sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
- /* no preallocation minimum, be smart in reiserfs_file_write instead */
- sbi->s_alloc_options.preallocmin = 0;
- /* Preallocate by 16 blocks (17-1) at once */
- sbi->s_alloc_options.preallocsize = 17;
- /* setup default block allocator options */
- reiserfs_init_alloc_options(s);
-
- spin_lock_init(&sbi->old_work_lock);
- INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
- mutex_init(&sbi->lock);
- sbi->lock_depth = -1;
-
- sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
- s->s_id);
- if (!sbi->commit_wq) {
- SWARN(silent, s, "", "Cannot allocate commit workqueue");
- errval = -ENOMEM;
- goto error_unlocked;
- }
-
- jdev_name = NULL;
- if (reiserfs_parse_options
- (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
- &commit_max_age, qf_names, &qfmt) == 0) {
- goto error_unlocked;
- }
- if (jdev_name && jdev_name[0]) {
- sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
- if (!sbi->s_jdev) {
- SWARN(silent, s, "", "Cannot allocate memory for "
- "journal device name");
- goto error_unlocked;
- }
- }
-#ifdef CONFIG_QUOTA
- handle_quota_files(s, qf_names, &qfmt);
-#endif
-
- if (blocks) {
- SWARN(silent, s, "jmacd-7", "resize option for remount only");
- goto error_unlocked;
- }
-
-	/*
-	 * try the old format (undistributed bitmap, super block in the 8th
-	 * 1k block of the device)
-	 */
- if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
- old_format = 1;
-
-	/*
-	 * try the new format (64th 1k block), which can contain the reiserfs
-	 * super block
-	 */
- else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
- SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
- s->s_id);
- goto error_unlocked;
- }
-
- s->s_time_min = 0;
- s->s_time_max = U32_MAX;
-
- rs = SB_DISK_SUPER_BLOCK(s);
-	/*
-	 * Let's do a basic sanity check to verify that the underlying device
-	 * is not smaller than the filesystem. If the check fails then abort
-	 * and scream, because bad stuff will happen otherwise.
-	 */
- if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) {
- SWARN(silent, s, "", "Filesystem cannot be "
- "mounted because it is bigger than the device");
- SWARN(silent, s, "", "You may need to run fsck "
- "or increase size of your LVM partition");
-		SWARN(silent, s, "", "Or maybe you forgot to "
-		      "reboot after fdisk when it told you to");
- goto error_unlocked;
- }
-
- sbi->s_mount_state = SB_REISERFS_STATE(s);
- sbi->s_mount_state = REISERFS_VALID_FS;
-
- if ((errval = reiserfs_init_bitmap_cache(s))) {
- SWARN(silent, s, "jmacd-8", "unable to read bitmap");
- goto error_unlocked;
- }
-
- errval = -EINVAL;
-#ifdef CONFIG_REISERFS_CHECK
- SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
- SWARN(silent, s, "", "- it is slow mode for debugging.");
-#endif
-
- /* make data=ordered the default */
- if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
- !reiserfs_data_writeback(s)) {
- sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
- }
-
- if (reiserfs_data_log(s)) {
- reiserfs_info(s, "using journaled data mode\n");
- } else if (reiserfs_data_ordered(s)) {
- reiserfs_info(s, "using ordered data mode\n");
- } else {
- reiserfs_info(s, "using writeback data mode\n");
- }
- if (reiserfs_barrier_flush(s)) {
- printk("reiserfs: using flush barriers\n");
- }
-
- if (journal_init(s, jdev_name, old_format, commit_max_age)) {
- SWARN(silent, s, "sh-2022",
- "unable to initialize journal space");
- goto error_unlocked;
- } else {
- /*
- * once this is set, journal_release must be called
- * if we error out of the mount
- */
- jinit_done = 1;
- }
-
- if (reread_meta_blocks(s)) {
- SWARN(silent, s, "jmacd-9",
- "unable to reread meta blocks after journal init");
- goto error_unlocked;
- }
-
- if (replay_only(s))
- goto error_unlocked;
-
- s->s_xattr = reiserfs_xattr_handlers;
-
- if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
- SWARN(silent, s, "clm-7000",
- "Detected readonly device, marking FS readonly");
- s->s_flags |= SB_RDONLY;
- }
- args.objectid = REISERFS_ROOT_OBJECTID;
- args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
- root_inode =
- iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
- reiserfs_init_locked_inode, (void *)&args);
- if (!root_inode) {
- SWARN(silent, s, "jmacd-10", "get root inode failed");
- goto error_unlocked;
- }
-
- /*
-	 * This path was assumed to be called with the BKL held in the old times.
- * Now we have inherited the big reiserfs lock from it and many
- * reiserfs helpers called in the mount path and elsewhere require
- * this lock to be held even if it's not always necessary. Let's be
- * conservative and hold it early. The window can be reduced after
- * careful review of the code.
- */
- reiserfs_write_lock(s);
-
- if (root_inode->i_state & I_NEW) {
- reiserfs_read_locked_inode(root_inode, &args);
- unlock_new_inode(root_inode);
- }
-
- if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) ||
- !root_inode->i_size) {
- SWARN(silent, s, "", "corrupt root inode, run fsck");
- iput(root_inode);
- errval = -EUCLEAN;
- goto error;
- }
-
- s->s_root = d_make_root(root_inode);
- if (!s->s_root)
- goto error;
- /* define and initialize hash function */
- sbi->s_hash_function = hash_function(s);
- if (sbi->s_hash_function == NULL) {
- dput(s->s_root);
- s->s_root = NULL;
- goto error;
- }
-
- if (is_reiserfs_3_5(rs)
- || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
- set_bit(REISERFS_3_5, &sbi->s_properties);
- else if (old_format)
- set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
- else
- set_bit(REISERFS_3_6, &sbi->s_properties);
-
- if (!sb_rdonly(s)) {
-
- errval = journal_begin(&th, s, 1);
- if (errval) {
- dput(s->s_root);
- s->s_root = NULL;
- goto error;
- }
- reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-
- set_sb_umount_state(rs, REISERFS_ERROR_FS);
- set_sb_fs_state(rs, 0);
-
- /*
- * Clear out s_bmap_nr if it would wrap. We can handle this
- * case, but older revisions can't. This will cause the
- * file system to fail mount on those older implementations,
- * avoiding corruption. -jeffm
- */
- if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
- sb_bmap_nr(rs) != 0) {
- reiserfs_warning(s, "super-2030", "This file system "
- "claims to use %u bitmap blocks in "
- "its super block, but requires %u. "
- "Clearing to zero.", sb_bmap_nr(rs),
- reiserfs_bmap_count(s));
-
- set_sb_bmap_nr(rs, 0);
- }
-
- if (old_format_only(s)) {
- /*
- * filesystem of format 3.5 either with standard
- * or non-standard journal
- */
- if (convert_reiserfs(s)) {
- /* and -o conv is given */
- if (!silent)
- reiserfs_info(s,
- "converting 3.5 filesystem to the 3.6 format");
-
- if (is_reiserfs_3_5(rs))
- /*
- * put magic string of 3.6 format.
- * 2.2 will not be able to
- * mount this filesystem anymore
- */
- memcpy(rs->s_v1.s_magic,
- reiserfs_3_6_magic_string,
- sizeof
- (reiserfs_3_6_magic_string));
-
- set_sb_version(rs, REISERFS_VERSION_2);
- reiserfs_convert_objectid_map_v1(s);
- set_bit(REISERFS_3_6, &sbi->s_properties);
- clear_bit(REISERFS_3_5, &sbi->s_properties);
- } else if (!silent) {
- reiserfs_info(s, "using 3.5.x disk format\n");
- }
- } else
- set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
-
-
- journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
- errval = journal_end(&th);
- if (errval) {
- dput(s->s_root);
- s->s_root = NULL;
- goto error;
- }
-
- reiserfs_write_unlock(s);
- if ((errval = reiserfs_lookup_privroot(s)) ||
- (errval = reiserfs_xattr_init(s, s->s_flags))) {
- dput(s->s_root);
- s->s_root = NULL;
- goto error_unlocked;
- }
- reiserfs_write_lock(s);
-
- /*
- * look for files which were to be removed in previous session
- */
- finish_unfinished(s);
- } else {
- if (old_format_only(s) && !silent) {
- reiserfs_info(s, "using 3.5.x disk format\n");
- }
-
- reiserfs_write_unlock(s);
- if ((errval = reiserfs_lookup_privroot(s)) ||
- (errval = reiserfs_xattr_init(s, s->s_flags))) {
- dput(s->s_root);
- s->s_root = NULL;
- goto error_unlocked;
- }
- reiserfs_write_lock(s);
- }
- /*
- * mark hash in super block: it could be unset. overwrite should be ok
- */
- set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
-
- handle_attrs(s);
-
- reiserfs_proc_info_init(s);
-
- init_waitqueue_head(&(sbi->s_wait));
- spin_lock_init(&sbi->bitmap_lock);
-
- reiserfs_write_unlock(s);
-
-	return 0;
-
-error:
- reiserfs_write_unlock(s);
-
-error_unlocked:
- /* kill the commit thread, free journal ram */
- if (jinit_done) {
- reiserfs_write_lock(s);
- journal_release_error(NULL, s);
- reiserfs_write_unlock(s);
- }
-
- if (sbi->commit_wq)
- destroy_workqueue(sbi->commit_wq);
-
- reiserfs_cancel_old_flush(s);
-
- reiserfs_free_bitmap_cache(s);
- if (SB_BUFFER_WITH_SB(s))
- brelse(SB_BUFFER_WITH_SB(s));
-#ifdef CONFIG_QUOTA
- {
- int j;
- for (j = 0; j < REISERFS_MAXQUOTAS; j++)
- kfree(qf_names[j]);
- }
-#endif
- kfree(sbi->s_jdev);
- kfree(sbi);
-
- s->s_fs_info = NULL;
- return errval;
-}
-
-static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
-
-	buf->f_namelen = (REISERFS_MAX_NAME(dentry->d_sb->s_blocksize));
- buf->f_bfree = sb_free_blocks(rs);
- buf->f_bavail = buf->f_bfree;
- buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
- buf->f_bsize = dentry->d_sb->s_blocksize;
- /* changed to accommodate gcc folks. */
- buf->f_type = REISERFS_SUPER_MAGIC;
- buf->f_fsid.val[0] = (u32)crc32_le(0, rs->s_uuid, sizeof(rs->s_uuid)/2);
- buf->f_fsid.val[1] = (u32)crc32_le(0, rs->s_uuid + sizeof(rs->s_uuid)/2,
- sizeof(rs->s_uuid)/2);
-
- return 0;
-}
-
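-/*
- * A minimal sketch of the f_fsid derivation above, assuming the on-disk
- * s_uuid is a 16-byte array and <linux/crc32.h> is available: each fsid
- * word is the little-endian CRC32 of one half of the UUID. The helper name
- * is hypothetical; it only restates the two assignments in reiserfs_statfs().
- */
-static inline void reiserfs_uuid_to_fsid(const unsigned char uuid[16],
-					 __kernel_fsid_t *fsid)
-{
-	fsid->val[0] = (u32)crc32_le(0, uuid, 8);	/* first half */
-	fsid->val[1] = (u32)crc32_le(0, uuid + 8, 8);	/* second half */
-}
-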
-#ifdef CONFIG_QUOTA
-static int reiserfs_write_dquot(struct dquot *dquot)
-{
- struct reiserfs_transaction_handle th;
- int ret, err;
- int depth;
-
- reiserfs_write_lock(dquot->dq_sb);
- ret =
- journal_begin(&th, dquot->dq_sb,
- REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
- if (ret)
- goto out;
- depth = reiserfs_write_unlock_nested(dquot->dq_sb);
- ret = dquot_commit(dquot);
- reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err = journal_end(&th);
- if (!ret && err)
- ret = err;
-out:
- reiserfs_write_unlock(dquot->dq_sb);
- return ret;
-}
-
-static int reiserfs_acquire_dquot(struct dquot *dquot)
-{
- struct reiserfs_transaction_handle th;
- int ret, err;
- int depth;
-
- reiserfs_write_lock(dquot->dq_sb);
- ret =
- journal_begin(&th, dquot->dq_sb,
- REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
- if (ret)
- goto out;
- depth = reiserfs_write_unlock_nested(dquot->dq_sb);
- ret = dquot_acquire(dquot);
- reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err = journal_end(&th);
- if (!ret && err)
- ret = err;
-out:
- reiserfs_write_unlock(dquot->dq_sb);
- return ret;
-}
-
-static int reiserfs_release_dquot(struct dquot *dquot)
-{
- struct reiserfs_transaction_handle th;
- int ret, err;
-
- reiserfs_write_lock(dquot->dq_sb);
- ret =
- journal_begin(&th, dquot->dq_sb,
- REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
- reiserfs_write_unlock(dquot->dq_sb);
- if (ret) {
- /* Release dquot anyway to avoid endless cycle in dqput() */
- dquot_release(dquot);
- goto out;
- }
- ret = dquot_release(dquot);
- reiserfs_write_lock(dquot->dq_sb);
- err = journal_end(&th);
- if (!ret && err)
- ret = err;
- reiserfs_write_unlock(dquot->dq_sb);
-out:
- return ret;
-}
-
-static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
-{
- /* Are we journaling quotas? */
- if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
- REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
- dquot_mark_dquot_dirty(dquot);
- return reiserfs_write_dquot(dquot);
- } else
- return dquot_mark_dquot_dirty(dquot);
-}
-
-static int reiserfs_write_info(struct super_block *sb, int type)
-{
- struct reiserfs_transaction_handle th;
- int ret, err;
- int depth;
-
- /* Data block + inode block */
- reiserfs_write_lock(sb);
- ret = journal_begin(&th, sb, 2);
- if (ret)
- goto out;
- depth = reiserfs_write_unlock_nested(sb);
- ret = dquot_commit_info(sb, type);
- reiserfs_write_lock_nested(sb, depth);
- err = journal_end(&th);
- if (!ret && err)
- ret = err;
-out:
- reiserfs_write_unlock(sb);
- return ret;
-}
-
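-/*
- * reiserfs_write_dquot(), reiserfs_acquire_dquot() and reiserfs_write_info()
- * above share one shape: take the reiserfs write lock, open a journal
- * transaction, drop to the nested-unlock state around the generic quota
- * operation, then close the transaction. A condensed sketch of that pattern,
- * with a placeholder standing in for dquot_commit()/dquot_acquire()/
- * dquot_commit_info():
- *
- *	reiserfs_write_lock(sb);
- *	ret = journal_begin(&th, sb, nblocks);
- *	if (!ret) {
- *		depth = reiserfs_write_unlock_nested(sb);
- *		ret = generic_quota_op(...);
- *		reiserfs_write_lock_nested(sb, depth);
- *		err = journal_end(&th);
- *		if (!ret && err)
- *			ret = err;
- *	}
- *	reiserfs_write_unlock(sb);
- *
- * reiserfs_release_dquot() deviates only in its error path, which must still
- * call dquot_release() to avoid an endless cycle in dqput().
- */
-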
-/*
- * Turn on quotas during mount time - we need to find the quota file and such...
- */
-static int reiserfs_quota_on_mount(struct super_block *sb, int type)
-{
- return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
- REISERFS_SB(sb)->s_jquota_fmt, type);
-}
-
-/*
- * Standard function to be called on quota_on
- */
-static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
- const struct path *path)
-{
- int err;
- struct inode *inode;
- struct reiserfs_transaction_handle th;
- int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
-
- reiserfs_write_lock(sb);
- if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
- err = -EINVAL;
- goto out;
- }
-
- /* Quotafile not on the same filesystem? */
- if (path->dentry->d_sb != sb) {
- err = -EXDEV;
- goto out;
- }
- inode = d_inode(path->dentry);
- /*
- * We must not pack tails for quota files on reiserfs for quota
- * IO to work
- */
- if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
- err = reiserfs_unpack(inode);
- if (err) {
- reiserfs_warning(sb, "super-6520",
- "Unpacking tail of quota file failed"
- " (%d). Cannot turn on quotas.", err);
- err = -EINVAL;
- goto out;
- }
- mark_inode_dirty(inode);
- }
- /* Journaling quota? */
- if (REISERFS_SB(sb)->s_qf_names[type]) {
- /* Quotafile not of fs root? */
- if (path->dentry->d_parent != sb->s_root)
- reiserfs_warning(sb, "super-6521",
- "Quota file not on filesystem root. "
- "Journalled quota will not work.");
- }
-
- /*
- * When we journal data on quota file, we have to flush journal to see
- * all updates to the file when we bypass pagecache...
- */
- if (reiserfs_file_data_log(inode)) {
- /* Just start temporary transaction and finish it */
- err = journal_begin(&th, sb, 1);
- if (err)
- goto out;
- err = journal_end_sync(&th);
- if (err)
- goto out;
- }
- reiserfs_write_unlock(sb);
- err = dquot_quota_on(sb, type, format_id, path);
- if (!err) {
- inode_lock(inode);
- REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
- REISERFS_NOATIME_FL;
- inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
- S_IMMUTABLE | S_NOATIME);
- inode_unlock(inode);
- mark_inode_dirty(inode);
- }
- return err;
-out:
- reiserfs_write_unlock(sb);
- return err;
-}
-
-static int reiserfs_quota_off(struct super_block *sb, int type)
-{
- int err;
- struct inode *inode = sb_dqopt(sb)->files[type];
-
- if (!inode || !igrab(inode))
- goto out;
-
- err = dquot_quota_off(sb, type);
- if (err)
- goto out_put;
-
- inode_lock(inode);
- REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
- REISERFS_NOATIME_FL);
- inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
- inode_unlock(inode);
- mark_inode_dirty(inode);
-out_put:
- iput(inode);
- return err;
-out:
- return dquot_quota_off(sb, type);
-}
-
-/*
- * Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races
- */
-static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
- size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- unsigned long blk = off >> sb->s_blocksize_bits;
- int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
- size_t toread;
- struct buffer_head tmp_bh, *bh;
- loff_t i_size = i_size_read(inode);
-
- if (off > i_size)
- return 0;
- if (off + len > i_size)
- len = i_size - off;
- toread = len;
- while (toread > 0) {
- tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
- tmp_bh.b_state = 0;
- /*
- * Quota files are without tails so we can safely
- * use this function
- */
- reiserfs_write_lock(sb);
- err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
- reiserfs_write_unlock(sb);
- if (err)
- return err;
- if (!buffer_mapped(&tmp_bh)) /* A hole? */
- memset(data, 0, tocopy);
- else {
- bh = sb_bread(sb, tmp_bh.b_blocknr);
- if (!bh)
- return -EIO;
- memcpy(data, bh->b_data + offset, tocopy);
- brelse(bh);
- }
- offset = 0;
- toread -= tocopy;
- data += tocopy;
- blk++;
- }
- return len;
-}
-
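-/*
- * Worked example of the blk/offset arithmetic above, assuming a 4KiB block
- * size (s_blocksize_bits == 12): a read at off = 5000 with len = 200 gives
- * blk = 5000 >> 12 = 1 and offset = 5000 & 4095 = 904, so a single pass
- * copies tocopy = min(4096 - 904, 200) = 200 bytes out of block 1 of the
- * quota file.
- */
-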
-/*
- * Write to quotafile (we know the transaction is already started and has
- * enough credits)
- */
-static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
- const char *data, size_t len, loff_t off)
-{
- struct inode *inode = sb_dqopt(sb)->files[type];
- unsigned long blk = off >> sb->s_blocksize_bits;
- int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
- int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
- size_t towrite = len;
- struct buffer_head tmp_bh, *bh;
-
- if (!current->journal_info) {
- printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
- (unsigned long long)off, (unsigned long long)len);
- return -EIO;
- }
- while (towrite > 0) {
- tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite);
- tmp_bh.b_state = 0;
- reiserfs_write_lock(sb);
- err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
- reiserfs_write_unlock(sb);
- if (err)
- goto out;
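-		/*
-		 * A partial-block write must read-modify-write the existing
-		 * block; a full, aligned block overwrite can take sb_getblk()
-		 * and skip the read entirely.
-		 */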
- if (offset || tocopy != sb->s_blocksize)
- bh = sb_bread(sb, tmp_bh.b_blocknr);
- else
- bh = sb_getblk(sb, tmp_bh.b_blocknr);
- if (!bh) {
- err = -EIO;
- goto out;
- }
- lock_buffer(bh);
- memcpy(bh->b_data + offset, data, tocopy);
- flush_dcache_page(bh->b_page);
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- reiserfs_write_lock(sb);
- reiserfs_prepare_for_journal(sb, bh, 1);
- journal_mark_dirty(current->journal_info, bh);
- if (!journal_quota)
- reiserfs_add_ordered_list(inode, bh);
- reiserfs_write_unlock(sb);
- brelse(bh);
- offset = 0;
- towrite -= tocopy;
- data += tocopy;
- blk++;
- }
-out:
- if (len == towrite)
- return err;
- if (inode->i_size < off + len - towrite)
- i_size_write(inode, off + len - towrite);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
- return len - towrite;
-}
-
-#endif
-
-static struct dentry *get_super_block(struct file_system_type *fs_type,
- int flags, const char *dev_name,
- void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
-}
-
-static int __init init_reiserfs_fs(void)
-{
- int ret;
-
- ret = init_inodecache();
- if (ret)
- return ret;
-
- reiserfs_proc_info_global_init();
-
- ret = register_filesystem(&reiserfs_fs_type);
- if (ret)
- goto out;
-
- return 0;
-out:
- reiserfs_proc_info_global_done();
- destroy_inodecache();
-
- return ret;
-}
-
-static void __exit exit_reiserfs_fs(void)
-{
- reiserfs_proc_info_global_done();
- unregister_filesystem(&reiserfs_fs_type);
- destroy_inodecache();
-}
-
-struct file_system_type reiserfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "reiserfs",
- .mount = get_super_block,
- .kill_sb = reiserfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("reiserfs");
-
-MODULE_DESCRIPTION("ReiserFS journaled filesystem");
-MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
-MODULE_LICENSE("GPL");
-
-module_init(init_reiserfs_fs);
-module_exit(exit_reiserfs_fs);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
deleted file mode 100644
index 2cec61af2a9e..000000000000
--- a/fs/reiserfs/tail_conversion.c
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
- * details
- */
-
-#include <linux/time.h>
-#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
-#include "reiserfs.h"
-
-/*
- * Access to the tail: anyone going to read the tail must make sure no
- * conversion is running; direct2indirect and indirect2direct cannot run
- * concurrently.
- */
-
-/*
- * Converts direct items to an unformatted node. Panics if the file has no
- * tail; returns -ENOSPC if there is no disk space for the conversion.
- *
- * path points to the first direct item of the file, regardless of how many
- * of them there are.
- */
-int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
- struct treepath *path, struct buffer_head *unbh,
- loff_t tail_offset)
-{
- struct super_block *sb = inode->i_sb;
- struct buffer_head *up_to_date_bh;
- struct item_head *p_le_ih = tp_item_head(path);
- unsigned long total_tail = 0;
-
- /* Key to search for the last byte of the converted item. */
- struct cpu_key end_key;
-
- /*
- * new indirect item to be inserted or key
- * of unfm pointer to be pasted
- */
- struct item_head ind_ih;
- int blk_size;
- /* returned value for reiserfs_insert_item and clones */
- int retval;
- /* Handle on an unformatted node that will be inserted in the tree. */
- unp_t unfm_ptr;
-
- BUG_ON(!th->t_trans_id);
-
- REISERFS_SB(sb)->s_direct2indirect++;
-
- blk_size = sb->s_blocksize;
-
- /*
- * and key to search for append or insert pointer to the new
- * unformatted node.
- */
- copy_item_head(&ind_ih, p_le_ih);
- set_le_ih_k_offset(&ind_ih, tail_offset);
- set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
-
- /* Set the key to search for the place for new unfm pointer */
- make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
-
- /* FIXME: we could avoid this */
- if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
- reiserfs_error(sb, "PAP-14030",
- "pasted or inserted byte exists in "
- "the tree %K. Use fsck to repair.", &end_key);
- pathrelse(path);
- return -EIO;
- }
-
- p_le_ih = tp_item_head(path);
-
- unfm_ptr = cpu_to_le32(unbh->b_blocknr);
-
- if (is_statdata_le_ih(p_le_ih)) {
- /* Insert new indirect item. */
-		set_ih_free_space(&ind_ih, 0);	/* to be removed in the near future */
- put_ih_item_len(&ind_ih, UNFM_P_SIZE);
- PATH_LAST_POSITION(path)++;
- retval =
- reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
- (char *)&unfm_ptr);
- } else {
- /* Paste into last indirect item of an object. */
- retval = reiserfs_paste_into_item(th, path, &end_key, inode,
- (char *)&unfm_ptr,
- UNFM_P_SIZE);
- }
- if (retval) {
- return retval;
- }
- /*
- * note: from here there are two keys which have matching first
- * three key components. They only differ by the fourth one.
- */
-
- /* Set the key to search for the direct items of the file */
- make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
- 4);
-
- /*
- * Move bytes from the direct items to the new unformatted node
- * and delete them.
- */
- while (1) {
- int tail_size;
-
- /*
- * end_key.k_offset is set so, that we will always have found
- * last item of the file
- */
- if (search_for_position_by_key(sb, &end_key, path) ==
- POSITION_FOUND)
- reiserfs_panic(sb, "PAP-14050",
- "direct item (%K) not found", &end_key);
- p_le_ih = tp_item_head(path);
- RFALSE(!is_direct_le_ih(p_le_ih),
- "vs-14055: direct item expected(%K), found %h",
- &end_key, p_le_ih);
- tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
- + ih_item_len(p_le_ih) - 1;
-
- /*
- * we only send the unbh pointer if the buffer is not
- * up to date. this avoids overwriting good data from
- * writepage() with old data from the disk or buffer cache
- * Special case: unbh->b_page will be NULL if we are coming
- * through DIRECT_IO handler here.
- */
- if (!unbh->b_page || buffer_uptodate(unbh)
- || PageUptodate(unbh->b_page)) {
- up_to_date_bh = NULL;
- } else {
- up_to_date_bh = unbh;
- }
- retval = reiserfs_delete_item(th, path, &end_key, inode,
- up_to_date_bh);
-
- total_tail += retval;
-
- /* done: file does not have direct items anymore */
- if (tail_size == retval)
- break;
-
- }
- /*
- * if we've copied bytes from disk into the page, we need to zero
- * out the unused part of the block (it was not up to date before)
- */
- if (up_to_date_bh) {
- unsigned pgoff =
- (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
- char *kaddr = kmap_atomic(up_to_date_bh->b_page);
- memset(kaddr + pgoff, 0, blk_size - total_tail);
- kunmap_atomic(kaddr);
- }
-
- REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-
- return 0;
-}
-
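-/*
- * Worked example, assuming a 4KiB block size: a 6000-byte file keeps bytes
- * 4096..5999 of its data as a tail in direct items. direct2indirect()
- * first inserts (or pastes) an indirect pointer to unbh, then loops
- * deleting the direct items until total_tail reaches 6000 & 4095 = 1904,
- * and finally zeroes the remaining blk_size - total_tail bytes of the page
- * when it was not already up to date.
- */
-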
-/* stolen from fs/buffer.c */
-void reiserfs_unmap_buffer(struct buffer_head *bh)
-{
- lock_buffer(bh);
- if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
- BUG();
- }
- clear_buffer_dirty(bh);
- /*
-	 * Remove the buffer from whatever list it belongs to. We are mostly
-	 * interested in removing it from the per-sb j_dirty_buffers list, to
-	 * avoid a BUG() on an attempt to write an unmapped buffer.
- */
- if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
- struct inode *inode = bh->b_folio->mapping->host;
- struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
- spin_lock(&j->j_dirty_buffers_lock);
- list_del_init(&bh->b_assoc_buffers);
- reiserfs_free_jh(bh);
- spin_unlock(&j->j_dirty_buffers_lock);
- }
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- bh->b_bdev = NULL;
- unlock_buffer(bh);
-}
-
-/*
- * This first locks the inode (neither reads nor syncs are permitted),
- * reads the tail through the page cache, and inserts a direct item. When
- * the direct item has been inserted successfully, the inode is left locked.
- * The return value is always what we expect from it (the number of cut
- * bytes). But when the tail remains in the unformatted node, we set mode
- * to SKIP_BALANCING and unlock the inode.
- */
-int indirect2direct(struct reiserfs_transaction_handle *th,
- struct inode *inode, struct page *page,
- struct treepath *path, /* path to the indirect item. */
- const struct cpu_key *item_key, /* Key to look for
- * unformatted node
- * pointer to be cut. */
- loff_t n_new_file_size, /* New file size. */
- char *mode)
-{
- struct super_block *sb = inode->i_sb;
- struct item_head s_ih;
- unsigned long block_size = sb->s_blocksize;
- char *tail;
- int tail_len, round_tail_len;
- loff_t pos, pos1; /* position of first byte of the tail */
- struct cpu_key key;
-
- BUG_ON(!th->t_trans_id);
-
- REISERFS_SB(sb)->s_indirect2direct++;
-
- *mode = M_SKIP_BALANCING;
-
- /* store item head path points to. */
- copy_item_head(&s_ih, tp_item_head(path));
-
- tail_len = (n_new_file_size & (block_size - 1));
- if (get_inode_sd_version(inode) == STAT_DATA_V2)
- round_tail_len = ROUND_UP(tail_len);
- else
- round_tail_len = tail_len;
-
- pos =
- le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
- 1) * sb->s_blocksize;
- pos1 = pos;
-
- /*
-	 * We are protected by i_mutex. The tail cannot disappear, nor can an
-	 * append be done: we are either in truncate or packing the tail in
-	 * file_release.
- */
-
- tail = (char *)kmap(page); /* this can schedule */
-
- if (path_changed(&s_ih, path)) {
- /* re-search indirect item */
- if (search_for_position_by_key(sb, item_key, path)
- == POSITION_NOT_FOUND)
- reiserfs_panic(sb, "PAP-5520",
- "item to be converted %K does not exist",
- item_key);
- copy_item_head(&s_ih, tp_item_head(path));
-#ifdef CONFIG_REISERFS_CHECK
- pos = le_ih_k_offset(&s_ih) - 1 +
- (ih_item_len(&s_ih) / UNFM_P_SIZE -
- 1) * sb->s_blocksize;
- if (pos != pos1)
- reiserfs_panic(sb, "vs-5530", "tail position "
- "changed while we were reading it");
-#endif
- }
-
- /* Set direct item header to insert. */
- make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode),
- pos1 + 1, TYPE_DIRECT, round_tail_len,
- 0xffff /*ih_free_space */ );
-
- /*
- * we want a pointer to the first byte of the tail in the page.
- * the page was locked and this part of the page was up to date when
- * indirect2direct was called, so we know the bytes are still valid
- */
- tail = tail + (pos & (PAGE_SIZE - 1));
-
- PATH_LAST_POSITION(path)++;
-
- key = *item_key;
- set_cpu_key_k_type(&key, TYPE_DIRECT);
- key.key_length = 4;
- /* Insert tail as new direct item in the tree */
- if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
- tail ? tail : NULL) < 0) {
- /*
-		 * No disk memory, so we cannot convert the last unformatted
-		 * node to a direct item. In this case we used to adjust the
-		 * indirect item's ih_free_space. Now ih_free_space is not
-		 * used; it would be ideal to write zeros to the corresponding
-		 * unformatted node. For now, i_size serves as the guard
-		 * against going past the end of the file.
- */
- kunmap(page);
- return block_size - round_tail_len;
- }
- kunmap(page);
-
- /* make sure to get the i_blocks changes from reiserfs_insert_item */
- reiserfs_update_sd(th, inode);
-
- /*
- * note: we have now the same as in above direct2indirect
- * conversion: there are two keys which have matching first three
- * key components. They only differ by the fourth one.
- */
-
- /*
- * We have inserted new direct item and must remove last
- * unformatted node.
- */
- *mode = M_CUT;
-
- /* we store position of first direct item in the in-core inode */
- /* mark_file_with_tail (inode, pos1 + 1); */
- REISERFS_I(inode)->i_first_direct_byte = pos1 + 1;
-
- return block_size - round_tail_len;
-}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
deleted file mode 100644
index 998035a6388e..000000000000
--- a/fs/reiserfs/xattr.c
+++ /dev/null
@@ -1,1039 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/reiserfs/xattr.c
- *
- * Copyright (c) 2002 by Jeff Mahoney, <jeffm@suse.com>
- *
- */
-
-/*
- * In order to implement EA/ACLs in a clean, backwards compatible manner,
- * they are implemented as files in a "private" directory.
- * Each EA is in its own file, with the directory layout like so (/ is assumed
- * to be relative to the fs root). Inside the /.reiserfs_priv/xattrs directory,
- * per-inode directories are named using the capital-hex form of the objectid
- * and generation number. Inside each directory are individual files named
- * after the extended attribute they hold.
- *
- * So, for objectid 12648430, we could have:
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access
- * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default
- * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type
- * .. or similar.
- *
- * The file contents are the text of the EA. The size is known based on the
- * stat data describing the file.
- *
- * In the case of system.posix_acl_access and system.posix_acl_default, since
- * these are special cases for filesystem ACLs, they are interpreted by the
- * kernel, in addition, they are negatively and positively cached and attached
- * to the inode so that unnecessary lookups are avoided.
- *
- * Locking works like so:
- * Directory components (xattr root, xattr dir) are protected by their i_mutex.
- * The xattrs themselves are protected by the xattr_sem.
- */
-
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-#include <net/checksum.h>
-#include <linux/stat.h>
-#include <linux/quotaops.h>
-#include <linux/security.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-#define PRIVROOT_NAME ".reiserfs_priv"
-#define XAROOT_NAME "xattrs"
-
-
-/*
- * Helpers for inode ops. We do this so that we don't have all the VFS
- * overhead and also for proper i_mutex annotation.
- * dir->i_mutex must be held for all of them.
- */
-#ifdef CONFIG_REISERFS_FS_XATTR
-static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
-{
- BUG_ON(!inode_is_locked(dir));
- return dir->i_op->create(&nop_mnt_idmap, dir, dentry, mode, true);
-}
-#endif
-
-static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- BUG_ON(!inode_is_locked(dir));
- return dir->i_op->mkdir(&nop_mnt_idmap, dir, dentry, mode);
-}
-
-/*
- * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
- * mutation ops aren't called during rename or splice, which are the
- * only other users of I_MUTEX_CHILD. It violates the ordering, but that's
- * better than allocating another subclass just for this code.
- */
-static int xattr_unlink(struct inode *dir, struct dentry *dentry)
-{
- int error;
-
- BUG_ON(!inode_is_locked(dir));
-
- inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
- error = dir->i_op->unlink(dir, dentry);
- inode_unlock(d_inode(dentry));
-
- if (!error)
- d_delete(dentry);
- return error;
-}
-
-static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
-{
- int error;
-
- BUG_ON(!inode_is_locked(dir));
-
- inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
- error = dir->i_op->rmdir(dir, dentry);
- if (!error)
- d_inode(dentry)->i_flags |= S_DEAD;
- inode_unlock(d_inode(dentry));
- if (!error)
- d_delete(dentry);
-
- return error;
-}
-
-#define xattr_may_create(flags) (!flags || flags & XATTR_CREATE)
-
-static struct dentry *open_xa_root(struct super_block *sb, int flags)
-{
- struct dentry *privroot = REISERFS_SB(sb)->priv_root;
- struct dentry *xaroot;
-
- if (d_really_is_negative(privroot))
- return ERR_PTR(-EOPNOTSUPP);
-
- inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
-
- xaroot = dget(REISERFS_SB(sb)->xattr_root);
- if (!xaroot)
- xaroot = ERR_PTR(-EOPNOTSUPP);
- else if (d_really_is_negative(xaroot)) {
- int err = -ENODATA;
-
- if (xattr_may_create(flags))
- err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
- if (err) {
- dput(xaroot);
- xaroot = ERR_PTR(err);
- }
- }
-
- inode_unlock(d_inode(privroot));
- return xaroot;
-}
-
-static struct dentry *open_xa_dir(const struct inode *inode, int flags)
-{
- struct dentry *xaroot, *xadir;
- char namebuf[17];
-
- xaroot = open_xa_root(inode->i_sb, flags);
- if (IS_ERR(xaroot))
- return xaroot;
-
- snprintf(namebuf, sizeof(namebuf), "%X.%X",
- le32_to_cpu(INODE_PKEY(inode)->k_objectid),
- inode->i_generation);
-
- inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
-
- xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
- if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
- int err = -ENODATA;
-
- if (xattr_may_create(flags))
- err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
- if (err) {
- dput(xadir);
- xadir = ERR_PTR(err);
- }
- }
-
- inode_unlock(d_inode(xaroot));
- dput(xaroot);
- return xadir;
-}
-
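-/*
- * Example of the per-inode directory name built above: for objectid
- * 12648430 (0xC0FFEE) and generation 0, the snprintf() yields "C0FFEE.0",
- * which is exactly the /.reiserfs_priv/xattrs/C0FFEE.0/ layout described
- * at the top of this file.
- */
-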
-/*
- * The following are side effects of other operations that aren't explicitly
- * modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc.
- */
-struct reiserfs_dentry_buf {
- struct dir_context ctx;
- struct dentry *xadir;
- int count;
- int err;
- struct dentry *dentries[8];
-};
-
-static bool
-fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
- loff_t offset, u64 ino, unsigned int d_type)
-{
- struct reiserfs_dentry_buf *dbuf =
- container_of(ctx, struct reiserfs_dentry_buf, ctx);
- struct dentry *dentry;
-
- WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
-
- if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
- return false;
-
- if (name[0] == '.' && (namelen < 2 ||
- (namelen == 2 && name[1] == '.')))
- return true;
-
- dentry = lookup_one_len(name, dbuf->xadir, namelen);
- if (IS_ERR(dentry)) {
- dbuf->err = PTR_ERR(dentry);
- return false;
- } else if (d_really_is_negative(dentry)) {
- /* A directory entry exists, but no file? */
- reiserfs_error(dentry->d_sb, "xattr-20003",
- "Corrupted directory: xattr %pd listed but "
- "not found for file %pd.\n",
- dentry, dbuf->xadir);
- dput(dentry);
- dbuf->err = -EIO;
- return false;
- }
-
- dbuf->dentries[dbuf->count++] = dentry;
- return true;
-}
-
-static void
-cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
-{
- int i;
-
- for (i = 0; i < buf->count; i++)
- if (buf->dentries[i])
- dput(buf->dentries[i]);
-}
-
-static int reiserfs_for_each_xattr(struct inode *inode,
- int (*action)(struct dentry *, void *),
- void *data)
-{
- struct dentry *dir;
- int i, err = 0;
- struct reiserfs_dentry_buf buf = {
- .ctx.actor = fill_with_dentries,
- };
-
-	/* Bail out early: an xattr has no xattrs associated with it */
- if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
- return 0;
-
- dir = open_xa_dir(inode, XATTR_REPLACE);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
- } else if (d_really_is_negative(dir)) {
- err = 0;
- goto out_dir;
- }
-
- inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
-
- buf.xadir = dir;
- while (1) {
- err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
- if (err)
- break;
- if (buf.err) {
- err = buf.err;
- break;
- }
- if (!buf.count)
- break;
- for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
- struct dentry *dentry = buf.dentries[i];
-
- if (!d_is_dir(dentry))
- err = action(dentry, data);
-
- dput(dentry);
- buf.dentries[i] = NULL;
- }
- if (err)
- break;
- buf.count = 0;
- }
- inode_unlock(d_inode(dir));
-
- cleanup_dentry_buf(&buf);
-
- if (!err) {
- /*
-		 * We start a transaction here to avoid an ABBA situation
- * between the xattr root's i_mutex and the journal lock.
- * This doesn't incur much additional overhead since the
- * new transaction will just nest inside the
- * outer transaction.
- */
- int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
- 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
- struct reiserfs_transaction_handle th;
-
- reiserfs_write_lock(inode->i_sb);
- err = journal_begin(&th, inode->i_sb, blocks);
- reiserfs_write_unlock(inode->i_sb);
- if (!err) {
- int jerror;
-
- inode_lock_nested(d_inode(dir->d_parent),
- I_MUTEX_XATTR);
- err = action(dir, data);
- reiserfs_write_lock(inode->i_sb);
- jerror = journal_end(&th);
- reiserfs_write_unlock(inode->i_sb);
- inode_unlock(d_inode(dir->d_parent));
- err = jerror ?: err;
- }
- }
-out_dir:
- dput(dir);
-out:
- /*
- * -ENODATA: this object doesn't have any xattrs
- * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
- * Neither are errors
- */
- if (err == -ENODATA || err == -EOPNOTSUPP)
- err = 0;
- return err;
-}
-
-static int delete_one_xattr(struct dentry *dentry, void *data)
-{
- struct inode *dir = d_inode(dentry->d_parent);
-
- /* This is the xattr dir, handle specially. */
- if (d_is_dir(dentry))
- return xattr_rmdir(dir, dentry);
-
- return xattr_unlink(dir, dentry);
-}
-
-static int chown_one_xattr(struct dentry *dentry, void *data)
-{
- struct iattr *attrs = data;
- int ia_valid = attrs->ia_valid;
- int err;
-
- /*
- * We only want the ownership bits. Otherwise, we'll do
- * things like change a directory to a regular file if
- * ATTR_MODE is set.
- */
- attrs->ia_valid &= (ATTR_UID|ATTR_GID);
- err = reiserfs_setattr(&nop_mnt_idmap, dentry, attrs);
- attrs->ia_valid = ia_valid;
-
- return err;
-}
-
-/* No i_mutex, but the inode is unconnected. */
-int reiserfs_delete_xattrs(struct inode *inode)
-{
- int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
-
- if (err)
- reiserfs_warning(inode->i_sb, "jdm-20004",
- "Couldn't delete all xattrs (%d)\n", err);
- return err;
-}
-
-/* inode->i_mutex: down */
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
-{
- int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
-
- if (err)
- reiserfs_warning(inode->i_sb, "jdm-20007",
- "Couldn't chown all xattrs (%d)\n", err);
- return err;
-}
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-/*
- * Returns a dentry corresponding to a specific extended attribute file
- * for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned.
- */
-static struct dentry *xattr_lookup(struct inode *inode, const char *name,
- int flags)
-{
- struct dentry *xadir, *xafile;
- int err = 0;
-
- xadir = open_xa_dir(inode, flags);
- if (IS_ERR(xadir))
- return ERR_CAST(xadir);
-
- inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
- xafile = lookup_one_len(name, xadir, strlen(name));
- if (IS_ERR(xafile)) {
- err = PTR_ERR(xafile);
- goto out;
- }
-
- if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
- err = -EEXIST;
-
- if (d_really_is_negative(xafile)) {
- err = -ENODATA;
- if (xattr_may_create(flags))
- err = xattr_create(d_inode(xadir), xafile,
- 0700|S_IFREG);
- }
-
- if (err)
- dput(xafile);
-out:
- inode_unlock(d_inode(xadir));
- dput(xadir);
- if (err)
- return ERR_PTR(err);
- return xafile;
-}
-
-/* Internal operations on file data */
-static inline void reiserfs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
-static struct page *reiserfs_get_page(struct inode *dir, size_t n)
-{
- struct address_space *mapping = dir->i_mapping;
- struct page *page;
- /*
- * We can deadlock if we try to free dentries,
- * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
- */
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
- if (!IS_ERR(page))
- kmap(page);
- return page;
-}
-
-static inline __u32 xattr_hash(const char *msg, int len)
-{
- /*
- * csum_partial() gives different results for little-endian and
-	 * big-endian hosts. Images created on little-endian hosts and
-	 * mounted on big-endian hosts (and vice versa) will see csum mismatches
- * when trying to fetch xattrs. Treating the hash as __wsum_t would
- * lower the frequency of mismatch. This is an endianness bug in
- * reiserfs. The return statement would result in a sparse warning. Do
- * not fix the sparse warning so as to not hide a reminder of the bug.
- */
- return csum_partial(msg, len, 0);
-}
-
-int reiserfs_commit_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
-
-static void update_ctime(struct inode *inode)
-{
- struct timespec64 now = current_time(inode);
- struct timespec64 ctime = inode_get_ctime(inode);
-
- if (inode_unhashed(inode) || !inode->i_nlink ||
- timespec64_equal(&ctime, &now))
- return;
-
- inode_set_ctime_to_ts(inode, now);
- mark_inode_dirty(inode);
-}
-
-static int lookup_and_delete_xattr(struct inode *inode, const char *name)
-{
- int err = 0;
- struct dentry *dentry, *xadir;
-
- xadir = open_xa_dir(inode, XATTR_REPLACE);
- if (IS_ERR(xadir))
- return PTR_ERR(xadir);
-
- inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
- dentry = lookup_one_len(name, xadir, strlen(name));
- if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
- goto out_dput;
- }
-
- if (d_really_is_positive(dentry)) {
- err = xattr_unlink(d_inode(xadir), dentry);
- update_ctime(inode);
- }
-
- dput(dentry);
-out_dput:
- inode_unlock(d_inode(xadir));
- dput(xadir);
- return err;
-}
-
-
-/* Generic extended attribute operations that can be used by xa plugins */
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
- struct inode *inode, const char *name,
- const void *buffer, size_t buffer_size, int flags)
-{
- int err = 0;
- struct dentry *dentry;
- struct page *page;
- char *data;
- size_t file_pos = 0;
- size_t buffer_pos = 0;
- size_t new_size;
- __u32 xahash = 0;
-
- if (get_inode_sd_version(inode) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- if (!buffer) {
- err = lookup_and_delete_xattr(inode, name);
- return err;
- }
-
- dentry = xattr_lookup(inode, name, flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- down_write(&REISERFS_I(inode)->i_xattr_sem);
-
- xahash = xattr_hash(buffer, buffer_size);
- while (buffer_pos < buffer_size || buffer_pos == 0) {
- size_t chunk;
- size_t skip = 0;
- size_t page_offset = (file_pos & (PAGE_SIZE - 1));
-
- if (buffer_size - buffer_pos > PAGE_SIZE)
- chunk = PAGE_SIZE;
- else
- chunk = buffer_size - buffer_pos;
-
- page = reiserfs_get_page(d_inode(dentry), file_pos);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto out_unlock;
- }
-
- lock_page(page);
- data = page_address(page);
-
- if (file_pos == 0) {
- struct reiserfs_xattr_header *rxh;
-
- skip = file_pos = sizeof(struct reiserfs_xattr_header);
- if (chunk + skip > PAGE_SIZE)
- chunk = PAGE_SIZE - skip;
- rxh = (struct reiserfs_xattr_header *)data;
- rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
- rxh->h_hash = cpu_to_le32(xahash);
- }
-
- reiserfs_write_lock(inode->i_sb);
- err = __reiserfs_write_begin(page, page_offset, chunk + skip);
- if (!err) {
- if (buffer)
- memcpy(data + skip, buffer + buffer_pos, chunk);
- err = reiserfs_commit_write(NULL, page, page_offset,
- page_offset + chunk +
- skip);
- }
- reiserfs_write_unlock(inode->i_sb);
- unlock_page(page);
- reiserfs_put_page(page);
- buffer_pos += chunk;
- file_pos += chunk;
- skip = 0;
- if (err || buffer_size == 0 || !buffer)
- break;
- }
-
- new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
- if (!err && new_size < i_size_read(d_inode(dentry))) {
- struct iattr newattrs = {
- .ia_ctime = current_time(inode),
- .ia_size = new_size,
- .ia_valid = ATTR_SIZE | ATTR_CTIME,
- };
-
- inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
- inode_dio_wait(d_inode(dentry));
-
- err = reiserfs_setattr(&nop_mnt_idmap, dentry, &newattrs);
- inode_unlock(d_inode(dentry));
- } else
- update_ctime(inode);
-out_unlock:
- up_write(&REISERFS_I(inode)->i_xattr_sem);
- dput(dentry);
- return err;
-}
-
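-/*
- * Shape of the xattr file written above, page by page: the first page
- * begins with a struct reiserfs_xattr_header (h_magic, plus h_hash over the
- * whole value), followed immediately by value bytes; later pages are raw
- * value bytes. The trailing setattr truncates the file when a shorter
- * value replaces a longer one.
- */
-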
-/* We need to start a transaction to maintain lock ordering */
-int reiserfs_xattr_set(struct inode *inode, const char *name,
- const void *buffer, size_t buffer_size, int flags)
-{
-
- struct reiserfs_transaction_handle th;
- int error, error2;
- size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
-
-	/* Check this now so we don't start a transaction only to do nothing. */
- if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
- return -EOPNOTSUPP;
-
- if (!(flags & XATTR_REPLACE))
- jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
-
- reiserfs_write_lock(inode->i_sb);
- error = journal_begin(&th, inode->i_sb, jbegin_count);
- reiserfs_write_unlock(inode->i_sb);
- if (error) {
- return error;
- }
-
- error = reiserfs_xattr_set_handle(&th, inode, name,
- buffer, buffer_size, flags);
-
- reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th);
- reiserfs_write_unlock(inode->i_sb);
- if (error == 0)
- error = error2;
-
- return error;
-}
-
-/*
- * inode->i_mutex: down
- */
-int
-reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
- size_t buffer_size)
-{
- ssize_t err = 0;
- struct dentry *dentry;
- size_t isize;
- size_t file_pos = 0;
- size_t buffer_pos = 0;
- struct page *page;
- __u32 hash = 0;
-
- if (name == NULL)
- return -EINVAL;
-
- /*
- * We can't have xattrs attached to v1 items since they don't have
- * generation numbers
- */
- if (get_inode_sd_version(inode) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- /*
- * priv_root needn't be initialized during mount so allow initial
- * lookups to succeed.
- */
- if (!REISERFS_SB(inode->i_sb)->priv_root)
- return 0;
-
- dentry = xattr_lookup(inode, name, XATTR_REPLACE);
- if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
- goto out;
- }
-
- down_read(&REISERFS_I(inode)->i_xattr_sem);
-
- isize = i_size_read(d_inode(dentry));
-
- /* Just return the size needed */
- if (buffer == NULL) {
- err = isize - sizeof(struct reiserfs_xattr_header);
- goto out_unlock;
- }
-
- if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
- err = -ERANGE;
- goto out_unlock;
- }
-
- while (file_pos < isize) {
- size_t chunk;
- char *data;
- size_t skip = 0;
-
- if (isize - file_pos > PAGE_SIZE)
- chunk = PAGE_SIZE;
- else
- chunk = isize - file_pos;
-
- page = reiserfs_get_page(d_inode(dentry), file_pos);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto out_unlock;
- }
-
- lock_page(page);
- data = page_address(page);
- if (file_pos == 0) {
- struct reiserfs_xattr_header *rxh =
- (struct reiserfs_xattr_header *)data;
- skip = file_pos = sizeof(struct reiserfs_xattr_header);
- chunk -= skip;
-			/* Magic doesn't match up. */
- if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
- unlock_page(page);
- reiserfs_put_page(page);
- reiserfs_warning(inode->i_sb, "jdm-20001",
- "Invalid magic for xattr (%s) "
- "associated with %k", name,
- INODE_PKEY(inode));
- err = -EIO;
- goto out_unlock;
- }
- hash = le32_to_cpu(rxh->h_hash);
- }
- memcpy(buffer + buffer_pos, data + skip, chunk);
- unlock_page(page);
- reiserfs_put_page(page);
- file_pos += chunk;
- buffer_pos += chunk;
- skip = 0;
- }
- err = isize - sizeof(struct reiserfs_xattr_header);
-
- if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
- hash) {
- reiserfs_warning(inode->i_sb, "jdm-20002",
- "Invalid hash for xattr (%s) associated "
- "with %k", name, INODE_PKEY(inode));
- err = -EIO;
- }
-
-out_unlock:
- up_read(&REISERFS_I(inode)->i_xattr_sem);
- dput(dentry);
-
-out:
- return err;
-}
-
-/*
- * In order to implement different sets of xattr operations for each xattr
- * prefix with the generic xattr API, a filesystem should create a
- * null-terminated array of struct xattr_handler (one for each prefix) and
- * hang a pointer to it off of the s_xattr field of the superblock.
- *
- * The generic_fooxattr() functions will use this list to dispatch xattr
- * operations to the correct xattr_handler.
- */
-#define for_each_xattr_handler(handlers, handler) \
- for ((handler) = *(handlers)++; \
- (handler) != NULL; \
- (handler) = *(handlers)++)
-
-static inline bool reiserfs_posix_acl_list(const char *name,
- struct dentry *dentry)
-{
- return (posix_acl_type(name) >= 0) &&
- IS_POSIXACL(d_backing_inode(dentry));
-}
-
-/* This is the implementation for the xattr plugin infrastructure */
-static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers,
- const char *name, struct dentry *dentry)
-{
- if (handlers) {
- const struct xattr_handler *xah = NULL;
-
- for_each_xattr_handler(handlers, xah) {
- const char *prefix = xattr_prefix(xah);
-
- if (strncmp(prefix, name, strlen(prefix)))
- continue;
-
- if (!xattr_handler_can_list(xah, dentry))
- return false;
-
- return true;
- }
- }
-
- return reiserfs_posix_acl_list(name, dentry);
-}
-
-struct listxattr_buf {
- struct dir_context ctx;
- size_t size;
- size_t pos;
- char *buf;
- struct dentry *dentry;
-};
-
-static bool listxattr_filler(struct dir_context *ctx, const char *name,
- int namelen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- struct listxattr_buf *b =
- container_of(ctx, struct listxattr_buf, ctx);
- size_t size;
-
- if (name[0] != '.' ||
- (namelen != 1 && (name[1] != '.' || namelen != 2))) {
- if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name,
- b->dentry))
- return true;
- size = namelen + 1;
- if (b->buf) {
- if (b->pos + size > b->size) {
- b->pos = -ERANGE;
- return false;
- }
- memcpy(b->buf + b->pos, name, namelen);
- b->buf[b->pos + namelen] = 0;
- }
- b->pos += size;
- }
- return true;
-}
-
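-/*
- * The buffer filled above follows the listxattr(2) convention: a
- * concatenation of NUL-terminated names such as "user.foo\0user.bar\0",
- * with b->pos tracking the bytes consumed (or set to -ERANGE when the
- * caller's buffer is too small).
- */
-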
-/*
- * Inode operation listxattr()
- *
- * We totally ignore the generic listxattr here because it would be stupid
- * not to. Since the xattrs are organized in a directory, we can just
- * readdir to find them.
- */
-ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
-{
- struct dentry *dir;
- int err = 0;
- struct listxattr_buf buf = {
- .ctx.actor = listxattr_filler,
- .dentry = dentry,
- .buf = buffer,
- .size = buffer ? size : 0,
- };
-
- if (d_really_is_negative(dentry))
- return -EINVAL;
-
- if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
- return -EOPNOTSUPP;
-
- dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- if (err == -ENODATA)
- err = 0; /* Not an error if there aren't any xattrs */
- goto out;
- }
-
- inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
- err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
- inode_unlock(d_inode(dir));
-
- if (!err)
- err = buf.pos;
-
- dput(dir);
-out:
- return err;
-}
-
-static int create_privroot(struct dentry *dentry)
-{
- int err;
- struct inode *inode = d_inode(dentry->d_parent);
-
- WARN_ON_ONCE(!inode_is_locked(inode));
-
- err = xattr_mkdir(inode, dentry, 0700);
- if (err || d_really_is_negative(dentry)) {
- reiserfs_warning(dentry->d_sb, "jdm-20006",
- "xattrs/ACLs enabled and couldn't "
- "find/create .reiserfs_priv. "
- "Failing mount.");
- return -EOPNOTSUPP;
- }
-
- reiserfs_init_priv_inode(d_inode(dentry));
- reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
- "storage.\n", PRIVROOT_NAME);
-
- return 0;
-}
-
-#else
-int __init reiserfs_xattr_register_handlers(void) { return 0; }
-void reiserfs_xattr_unregister_handlers(void) {}
-static int create_privroot(struct dentry *dentry) { return 0; }
-#endif
-
-/* Actual operations that are exported to VFS-land */
-const struct xattr_handler * const reiserfs_xattr_handlers[] = {
-#ifdef CONFIG_REISERFS_FS_XATTR
- &reiserfs_xattr_user_handler,
- &reiserfs_xattr_trusted_handler,
-#endif
-#ifdef CONFIG_REISERFS_FS_SECURITY
- &reiserfs_xattr_security_handler,
-#endif
- NULL
-};
-
-static int xattr_mount_check(struct super_block *s)
-{
- /*
- * We need generation numbers to ensure that the oid mapping is correct
- * v3.5 filesystems don't have them.
- */
- if (old_format_only(s)) {
- if (reiserfs_xattrs_optional(s)) {
- /*
- * Old format filesystem, but optional xattrs have
- * been enabled. Error out.
- */
- reiserfs_warning(s, "jdm-2005",
- "xattrs/ACLs not supported "
- "on pre-v3.6 format filesystems. "
- "Failing mount.");
- return -EOPNOTSUPP;
- }
- }
-
- return 0;
-}
-
-int reiserfs_permission(struct mnt_idmap *idmap, struct inode *inode,
- int mask)
-{
- /*
- * We don't do permission checks on the internal objects.
- * Permissions are determined by the "owning" object.
- */
- if (IS_PRIVATE(inode))
- return 0;
-
- return generic_permission(&nop_mnt_idmap, inode, mask);
-}
-
-static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return -EPERM;
-}
-
-static const struct dentry_operations xattr_lookup_poison_ops = {
- .d_revalidate = xattr_hide_revalidate,
-};
-
-int reiserfs_lookup_privroot(struct super_block *s)
-{
- struct dentry *dentry;
- int err = 0;
-
- /* If we don't have the privroot located yet - go find it */
- inode_lock(d_inode(s->s_root));
- dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
- strlen(PRIVROOT_NAME));
- if (!IS_ERR(dentry)) {
- REISERFS_SB(s)->priv_root = dentry;
- d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (d_really_is_positive(dentry))
- reiserfs_init_priv_inode(d_inode(dentry));
- } else
- err = PTR_ERR(dentry);
- inode_unlock(d_inode(s->s_root));
-
- return err;
-}
-
-/*
- * We need to take a copy of the mount flags since things like
- * SB_RDONLY don't get set until *after* we're called.
- * mount_flags != mount_options
- */
-int reiserfs_xattr_init(struct super_block *s, int mount_flags)
-{
- int err = 0;
- struct dentry *privroot = REISERFS_SB(s)->priv_root;
-
- err = xattr_mount_check(s);
- if (err)
- goto error;
-
- if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) {
- inode_lock(d_inode(s->s_root));
- err = create_privroot(REISERFS_SB(s)->priv_root);
- inode_unlock(d_inode(s->s_root));
- }
-
- if (d_really_is_positive(privroot)) {
- inode_lock(d_inode(privroot));
- if (!REISERFS_SB(s)->xattr_root) {
- struct dentry *dentry;
-
- dentry = lookup_one_len(XAROOT_NAME, privroot,
- strlen(XAROOT_NAME));
- if (!IS_ERR(dentry))
- REISERFS_SB(s)->xattr_root = dentry;
- else
- err = PTR_ERR(dentry);
- }
- inode_unlock(d_inode(privroot));
- }
-
-error:
- if (err) {
- clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
- clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
- }
-
- /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */
- if (reiserfs_posixacl(s))
- s->s_flags |= SB_POSIXACL;
- else
- s->s_flags &= ~SB_POSIXACL;
-
- return err;
-}
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
deleted file mode 100644
index 5868a4e990e3..000000000000
--- a/fs/reiserfs/xattr.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/reiserfs_xattr.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/rwsem.h>
-#include <linux/xattr.h>
-
-struct inode;
-struct dentry;
-struct iattr;
-struct super_block;
-
-int reiserfs_xattr_register_handlers(void) __init;
-void reiserfs_xattr_unregister_handlers(void);
-int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
-int reiserfs_lookup_privroot(struct super_block *sb);
-int reiserfs_delete_xattrs(struct inode *inode);
-int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
-int reiserfs_permission(struct mnt_idmap *idmap,
- struct inode *inode, int mask);
-
-#ifdef CONFIG_REISERFS_FS_XATTR
-#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
-int reiserfs_xattr_get(struct inode *, const char *, void *, size_t);
-int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
-int reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *,
- struct inode *, const char *, const void *,
- size_t, int);
-
-extern const struct xattr_handler reiserfs_xattr_user_handler;
-extern const struct xattr_handler reiserfs_xattr_trusted_handler;
-extern const struct xattr_handler reiserfs_xattr_security_handler;
-#ifdef CONFIG_REISERFS_FS_SECURITY
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
- const struct qstr *qstr,
- struct reiserfs_security_handle *sec);
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct reiserfs_security_handle *sec);
-void reiserfs_security_free(struct reiserfs_security_handle *sec);
-#endif
-
-static inline int reiserfs_xattrs_initialized(struct super_block *sb)
-{
- return REISERFS_SB(sb)->priv_root && REISERFS_SB(sb)->xattr_root;
-}
-
-#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
-static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
-{
- loff_t ret = 0;
- if (reiserfs_file_data_log(inode)) {
- ret = _ROUND_UP(xattr_size(size), inode->i_sb->s_blocksize);
- ret >>= inode->i_sb->s_blocksize_bits;
- }
- return ret;
-}
-
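-/*
- * Worked example, assuming a 4KiB block size and data journaling enabled
- * on the file: a 100-byte value gives xattr_size(100) = 108 bytes, which
- * rounds up to one block, so reiserfs_xattr_nblocks() returns 1. When
- * reiserfs_file_data_log() is false it returns 0.
- */
-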
-/*
- * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
- * Let's try to be smart about it.
- * xattr root: We cache it. If it's not cached, we may need to create it.
- * xattr dir: If anything has been loaded for this inode, we can set a flag
- * saying so.
- * xattr file: Since we don't cache xattrs, we can't tell. We always include
- * blocks for it.
- *
- * However, since root and dir can be created between calls - YOU MUST SAVE
- * THIS VALUE.
- */
-static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
-{
- size_t nblocks = JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
- if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
- nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
- if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
- nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
- }
-
- return nblocks;
-}
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
- init_rwsem(&REISERFS_I(inode)->i_xattr_sem);
-}
-
-#else
-
-#define reiserfs_listxattr NULL
-
-static inline void reiserfs_init_xattr_rwsem(struct inode *inode)
-{
-}
-#endif /* CONFIG_REISERFS_FS_XATTR */
-
-#ifndef CONFIG_REISERFS_FS_SECURITY
-static inline int reiserfs_security_init(struct inode *dir,
- struct inode *inode,
- const struct qstr *qstr,
- struct reiserfs_security_handle *sec)
-{
- return 0;
-}
-static inline int
-reiserfs_security_write(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct reiserfs_security_handle *sec)
-{
- return 0;
-}
-static inline void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{}
-#endif
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
deleted file mode 100644
index 064264992b49..000000000000
--- a/fs/reiserfs/xattr_acl.c
+++ /dev/null
@@ -1,411 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/capability.h>
-#include <linux/fs.h>
-#include <linux/posix_acl.h>
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include <linux/posix_acl_xattr.h>
-#include "xattr.h"
-#include "acl.h"
-#include <linux/uaccess.h>
-
-static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
- struct inode *inode, int type,
- struct posix_acl *acl);
-
-
-int
-reiserfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
- struct posix_acl *acl, int type)
-{
- int error, error2;
- struct reiserfs_transaction_handle th;
- size_t jcreate_blocks;
- int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
- int update_mode = 0;
- struct inode *inode = d_inode(dentry);
- umode_t mode = inode->i_mode;
-
- /*
- * Pessimism: We can't assume that anything from the xattr root up
- * has been created.
- */
-
- jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
- reiserfs_xattr_nblocks(inode, size) * 2;
-
- reiserfs_write_lock(inode->i_sb);
- error = journal_begin(&th, inode->i_sb, jcreate_blocks);
- reiserfs_write_unlock(inode->i_sb);
- if (error == 0) {
- if (type == ACL_TYPE_ACCESS && acl) {
- error = posix_acl_update_mode(&nop_mnt_idmap, inode,
- &mode, &acl);
- if (error)
- goto unlock;
- update_mode = 1;
- }
- error = __reiserfs_set_acl(&th, inode, type, acl);
- if (!error && update_mode)
- inode->i_mode = mode;
-unlock:
- reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th);
- reiserfs_write_unlock(inode->i_sb);
- if (error2)
- error = error2;
- }
-
- return error;
-}
-
-/*
- * Convert from filesystem to in-memory representation.
- */
-static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
-{
- const char *end = (char *)value + size;
- int n, count;
- struct posix_acl *acl;
-
- if (!value)
- return NULL;
- if (size < sizeof(reiserfs_acl_header))
- return ERR_PTR(-EINVAL);
- if (((reiserfs_acl_header *) value)->a_version !=
- cpu_to_le32(REISERFS_ACL_VERSION))
- return ERR_PTR(-EINVAL);
- value = (char *)value + sizeof(reiserfs_acl_header);
- count = reiserfs_acl_count(size);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
- acl = posix_acl_alloc(count, GFP_NOFS);
- if (!acl)
- return ERR_PTR(-ENOMEM);
- for (n = 0; n < count; n++) {
- reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
- if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
- goto fail;
- acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
- acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- switch (acl->a_entries[n].e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- value = (char *)value +
- sizeof(reiserfs_acl_entry_short);
- break;
-
- case ACL_USER:
- value = (char *)value + sizeof(reiserfs_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_uid =
- make_kuid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
- case ACL_GROUP:
- value = (char *)value + sizeof(reiserfs_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_gid =
- make_kgid(&init_user_ns,
- le32_to_cpu(entry->e_id));
- break;
-
- default:
- goto fail;
- }
- }
- if (value != end)
- goto fail;
- return acl;
-
-fail:
- posix_acl_release(acl);
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Convert from in-memory to filesystem representation.
- */
-static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
-{
- reiserfs_acl_header *ext_acl;
- char *e;
- int n;
-
- *size = reiserfs_acl_size(acl->a_count);
- ext_acl = kmalloc(sizeof(reiserfs_acl_header) +
- acl->a_count *
- sizeof(reiserfs_acl_entry),
- GFP_NOFS);
- if (!ext_acl)
- return ERR_PTR(-ENOMEM);
- ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
- e = (char *)ext_acl + sizeof(reiserfs_acl_header);
- for (n = 0; n < acl->a_count; n++) {
- const struct posix_acl_entry *acl_e = &acl->a_entries[n];
- reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
- entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
- entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch (acl->a_entries[n].e_tag) {
- case ACL_USER:
- entry->e_id = cpu_to_le32(
- from_kuid(&init_user_ns, acl_e->e_uid));
- e += sizeof(reiserfs_acl_entry);
- break;
- case ACL_GROUP:
- entry->e_id = cpu_to_le32(
- from_kgid(&init_user_ns, acl_e->e_gid));
- e += sizeof(reiserfs_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- e += sizeof(reiserfs_acl_entry_short);
- break;
-
- default:
- goto fail;
- }
- }
- return (char *)ext_acl;
-
-fail:
- kfree(ext_acl);
- return ERR_PTR(-EINVAL);
-}
-
-/*
- * Inode operation get_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu)
-{
- char *name, *value;
- struct posix_acl *acl;
- int size;
- int retval;
-
- if (rcu)
- return ERR_PTR(-ECHILD);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name = XATTR_NAME_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name = XATTR_NAME_POSIX_ACL_DEFAULT;
- break;
- default:
- BUG();
- }
-
- size = reiserfs_xattr_get(inode, name, NULL, 0);
- if (size < 0) {
- if (size == -ENODATA || size == -ENOSYS)
- return NULL;
- return ERR_PTR(size);
- }
-
- value = kmalloc(size, GFP_NOFS);
- if (!value)
- return ERR_PTR(-ENOMEM);
-
- retval = reiserfs_xattr_get(inode, name, value, size);
- if (retval == -ENODATA || retval == -ENOSYS) {
- /*
- * This shouldn't actually happen as it should have
- * been caught above.. but just in case
- */
- acl = NULL;
- } else if (retval < 0) {
- acl = ERR_PTR(retval);
- } else {
- acl = reiserfs_posix_acl_from_disk(value, retval);
- }
-
- kfree(value);
- return acl;
-}
-
-/*
- * Inode operation set_posix_acl().
- *
- * inode->i_mutex: down
- * BKL held [before 2.5.x]
- */
-static int
-__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
- int type, struct posix_acl *acl)
-{
- char *name;
- void *value = NULL;
- size_t size = 0;
- int error;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name = XATTR_NAME_POSIX_ACL_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name = XATTR_NAME_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
- default:
- return -EINVAL;
- }
-
- if (acl) {
- value = reiserfs_posix_acl_to_disk(acl, &size);
- if (IS_ERR(value))
- return (int)PTR_ERR(value);
- }
-
- error = reiserfs_xattr_set_handle(th, inode, name, value, size, 0);
-
- /*
- * Ensure that the inode gets dirtied if we're only using
- * the mode bits and an old ACL didn't exist. We don't need
- * to check if the inode is hashed here since we won't get
- * called by reiserfs_inherit_default_acl().
- */
- if (error == -ENODATA) {
- error = 0;
- if (type == ACL_TYPE_ACCESS) {
- inode_set_ctime_current(inode);
- mark_inode_dirty(inode);
- }
- }
-
- kfree(value);
-
- if (!error)
- set_cached_acl(inode, type, acl);
-
- return error;
-}
-
-/*
- * dir->i_mutex: locked,
- * inode is new and not released into the wild yet
- */
-int
-reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
- struct inode *dir, struct dentry *dentry,
- struct inode *inode)
-{
- struct posix_acl *default_acl, *acl;
- int err = 0;
-
- /* ACLs only get applied to files and directories */
- if (S_ISLNK(inode->i_mode))
- return 0;
-
- /*
- * ACLs can only be used on "new" objects, so if it's an old object
- * there is nothing to inherit from
- */
- if (get_inode_sd_version(dir) == STAT_DATA_V1)
- goto apply_umask;
-
- /*
- * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
- * would be useless since permissions are ignored, and a pain because
- * it introduces locking cycles
- */
- if (IS_PRIVATE(inode))
- goto apply_umask;
-
- err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (err)
- return err;
-
- if (default_acl) {
- err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
- default_acl);
- posix_acl_release(default_acl);
- }
- if (acl) {
- if (!err)
- err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
- acl);
- posix_acl_release(acl);
- }
-
- return err;
-
-apply_umask:
- /* no ACL, apply umask */
- inode->i_mode &= ~current_umask();
- return err;
-}
-
-/* This is used to cache the default acl before a new object is created.
- * The biggest reason for this is to get an idea of how many blocks will
- * actually be required for the create operation if we must inherit an ACL.
- * An ACL write can add up to 3 object creations and an additional file write
- * so we'd prefer not to reserve that many blocks in the journal if we can.
- * It also has the advantage of not loading the ACL with a transaction open,
- * this may seem silly, but if the owner of the directory is doing the
- * creation, the ACL may not be loaded since the permissions wouldn't require
- * it.
- * We return the number of blocks required for the transaction.
- */
-int reiserfs_cache_default_acl(struct inode *inode)
-{
- struct posix_acl *acl;
- int nblocks = 0;
-
- if (IS_PRIVATE(inode))
- return 0;
-
- acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
-
- if (acl && !IS_ERR(acl)) {
- int size = reiserfs_acl_size(acl->a_count);
-
- /* Other xattrs can be created during inode creation. We don't
- * want to claim too many blocks, so we check to see if we
- * need to create the tree to the xattrs, and then we
- * just want two files. */
- nblocks = reiserfs_xattr_jcreate_nblocks(inode);
- nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
-
- REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-
- /* We need to account for writes + bitmaps for two files */
- nblocks += reiserfs_xattr_nblocks(inode, size) * 4;
- posix_acl_release(acl);
- }
-
- return nblocks;
-}
-
-/*
- * Called under i_mutex
- */
-int reiserfs_acl_chmod(struct dentry *dentry)
-{
- struct inode *inode = d_inode(dentry);
-
- if (IS_PRIVATE(inode))
- return 0;
- if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
- !reiserfs_posixacl(inode->i_sb))
- return 0;
-
- return posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
-}
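
A hedged sketch of the call pattern the comment above describes: reiserfs_cache_default_acl() is invoked on the parent directory before the create transaction is opened, so its return value can be folded into the journal reservation (and i_has_xattr_dir keeps the directory from being counted twice later). The function name is illustrative.

/* Hedged sketch: size a create transaction using the cached
 * default ACL of the parent directory.
 */
static int example_create_reservation(struct inode *dir)
{
	/* Base cost of creating the object itself... */
	int jblocks = JOURNAL_BLOCKS_PER_OBJECT(dir->i_sb);

	/* ...plus whatever inheriting the default ACL may need. */
	jblocks += reiserfs_cache_default_acl(dir);
	return jblocks;
}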
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
deleted file mode 100644
index 078dd8cc312f..000000000000
--- a/fs/reiserfs/xattr_security.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include <linux/slab.h>
-#include "xattr.h"
-#include <linux/security.h>
-#include <linux/uaccess.h>
-
-static int
-security_get(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, void *buffer, size_t size)
-{
- if (IS_PRIVATE(inode))
- return -EPERM;
-
- return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
- buffer, size);
-}
-
-static int
-security_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap, struct dentry *unused,
- struct inode *inode, const char *name, const void *buffer,
- size_t size, int flags)
-{
- if (IS_PRIVATE(inode))
- return -EPERM;
-
- return reiserfs_xattr_set(inode,
- xattr_full_name(handler, name),
- buffer, size, flags);
-}
-
-static bool security_list(struct dentry *dentry)
-{
- return !IS_PRIVATE(d_inode(dentry));
-}
-
-static int
-reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
-{
- struct reiserfs_security_handle *sec = fs_info;
-
- sec->value = kmemdup(xattr_array->value, xattr_array->value_len,
- GFP_KERNEL);
- if (!sec->value)
- return -ENOMEM;
-
- sec->name = xattr_array->name;
- sec->length = xattr_array->value_len;
- return 0;
-}
-
-/* Initializes the security context for a new inode and returns the number
- * of blocks needed for the transaction. If successful, reiserfs_security
- * must be released using reiserfs_security_free when the caller is done. */
-int reiserfs_security_init(struct inode *dir, struct inode *inode,
- const struct qstr *qstr,
- struct reiserfs_security_handle *sec)
-{
- int blocks = 0;
- int error;
-
- sec->name = NULL;
- sec->value = NULL;
- sec->length = 0;
-
- /* Don't add selinux attributes on xattrs - they'll never get used */
- if (IS_PRIVATE(dir))
- return 0;
-
- error = security_inode_init_security(inode, dir, qstr,
- &reiserfs_initxattrs, sec);
- if (error) {
- sec->name = NULL;
- sec->value = NULL;
- sec->length = 0;
- return error;
- }
-
- if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
- blocks = reiserfs_xattr_jcreate_nblocks(inode) +
- reiserfs_xattr_nblocks(inode, sec->length);
- /* We don't want to count the directories twice if we have
- * a default ACL. */
- REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
- }
- return blocks;
-}
-
-int reiserfs_security_write(struct reiserfs_transaction_handle *th,
- struct inode *inode,
- struct reiserfs_security_handle *sec)
-{
- char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
- int error;
-
- if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
- return -EINVAL;
-
- strlcat(xattr_name, sec->name, sizeof(xattr_name));
-
- error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
- sec->length, XATTR_CREATE);
- if (error == -ENODATA || error == -EOPNOTSUPP)
- error = 0;
-
- return error;
-}
-
-void reiserfs_security_free(struct reiserfs_security_handle *sec)
-{
- kfree(sec->value);
- sec->name = NULL;
- sec->value = NULL;
-}
-
-const struct xattr_handler reiserfs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = security_get,
- .set = security_set,
- .list = security_list,
-};
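
A hedged sketch of the lifecycle documented above for the security handle: init before the transaction (it returns a journal block estimate), write inside it, free afterwards. Error handling is abbreviated and the function name is illustrative.

static int example_security_lifecycle(struct reiserfs_transaction_handle *th,
				      struct inode *dir, struct inode *inode,
				      const struct qstr *qstr)
{
	struct reiserfs_security_handle sec;
	int jblocks, err;

	jblocks = reiserfs_security_init(dir, inode, qstr, &sec);
	if (jblocks < 0)
		return jblocks;

	/* ... add jblocks to the journal_begin() reservation ... */

	err = reiserfs_security_write(th, inode, &sec);
	reiserfs_security_free(&sec);
	return err;
}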
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
deleted file mode 100644
index 0c0c74d8db0e..000000000000
--- a/fs/reiserfs/xattr_trusted.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-trusted_get(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, void *buffer, size_t size)
-{
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
- return -EPERM;
-
- return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
- buffer, size);
-}
-
-static int
-trusted_set(const struct xattr_handler *handler,
- struct mnt_idmap *idmap, struct dentry *unused,
- struct inode *inode, const char *name, const void *buffer,
- size_t size, int flags)
-{
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
- return -EPERM;
-
- return reiserfs_xattr_set(inode,
- xattr_full_name(handler, name),
- buffer, size, flags);
-}
-
-static bool trusted_list(struct dentry *dentry)
-{
- return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
-}
-
-const struct xattr_handler reiserfs_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .get = trusted_get,
- .set = trusted_set,
- .list = trusted_list,
-};
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
deleted file mode 100644
index 88195181e1d7..000000000000
--- a/fs/reiserfs/xattr_user.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include "reiserfs.h"
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/xattr.h>
-#include "xattr.h"
-#include <linux/uaccess.h>
-
-static int
-user_get(const struct xattr_handler *handler, struct dentry *unused,
- struct inode *inode, const char *name, void *buffer, size_t size)
-{
- if (!reiserfs_xattrs_user(inode->i_sb))
- return -EOPNOTSUPP;
- return reiserfs_xattr_get(inode, xattr_full_name(handler, name),
- buffer, size);
-}
-
-static int
-user_set(const struct xattr_handler *handler, struct mnt_idmap *idmap,
- struct dentry *unused,
- struct inode *inode, const char *name, const void *buffer,
- size_t size, int flags)
-{
- if (!reiserfs_xattrs_user(inode->i_sb))
- return -EOPNOTSUPP;
- return reiserfs_xattr_set(inode,
- xattr_full_name(handler, name),
- buffer, size, flags);
-}
-
-static bool user_list(struct dentry *dentry)
-{
- return reiserfs_xattrs_user(dentry->d_sb);
-}
-
-const struct xattr_handler reiserfs_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = user_get,
- .set = user_set,
- .list = user_list,
-};
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 4403d5c68fcb..26afbbbfb10c 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -536,20 +536,19 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = fd_file(dst_fd);
+ CLASS(fd, dst_fd)(info->dest_fd);
- if (!dst_file) {
+ if (fd_empty(dst_fd)) {
info->status = -EBADF;
goto next_loop;
}
if (info->reserved) {
info->status = -EINVAL;
- goto next_fdput;
+ goto next_loop;
}
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
+ deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd),
info->dest_offset, len,
REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
@@ -559,8 +558,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
else
info->bytes_deduped = len;
-next_fdput:
- fdput(dst_fd);
next_loop:
if (fatal_signal_pending(current))
break;
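
The change above is the first of many in this series converting fdget()/fdput() pairs to the scope-based CLASS(fd, ...) guard. A minimal sketch of the pattern, with an illustrative function name: the guard pins the descriptor on declaration and releases it automatically when it goes out of scope, so early returns need no cleanup labels.

#include <linux/file.h>

static int example_fd_op(int fd)
{
	CLASS(fd, f)(fd);	/* fdput() runs when f leaves scope */

	if (fd_empty(f))
		return -EBADF;	/* no explicit fdput() needed */
	if (!(fd_file(f)->f_mode & FMODE_READ))
		return -EINVAL;	/* cleanup is still automatic */

	/* operate on fd_file(f) here */
	return 0;
}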
diff --git a/fs/select.c b/fs/select.c
index a77907faf2b4..e223d1fe9d55 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -462,15 +462,22 @@ get_max:
EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
-static inline void wait_key_set(poll_table *wait, unsigned long in,
+static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit,
__poll_t ll_flag)
{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return EPOLLNVAL;
+
wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
wait->_key |= POLLOUT_SET;
+
+ return vfs_poll(fd_file(f), wait);
}
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
@@ -522,20 +529,12 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
}
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
- struct fd f;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
- mask = EPOLLNVAL;
- f = fdget(i);
- if (fd_file(f)) {
- wait_key_set(wait, in, out, bit,
- busy_flag);
- mask = vfs_poll(fd_file(f), wait);
-
- fdput(f);
- }
+ mask = select_poll_one(i, wait, in, out, bit,
+ busy_flag);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
@@ -856,15 +855,14 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
__poll_t busy_flag)
{
int fd = pollfd->fd;
- __poll_t mask = 0, filter;
- struct fd f;
+ __poll_t mask, filter;
if (fd < 0)
- goto out;
- mask = EPOLLNVAL;
- f = fdget(fd);
- if (!fd_file(f))
- goto out;
+ return 0;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return EPOLLNVAL;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
@@ -872,13 +870,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
- mask &= filter; /* Mask out unneeded events. */
- fdput(f);
-
-out:
- /* ... and so does ->revents */
- pollfd->revents = mangle_poll(mask);
- return mask;
+ return mask & filter; /* Mask out unneeded events. */
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
@@ -910,6 +902,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
+ __poll_t mask;
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
@@ -917,8 +910,9 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt, &can_busy_loop,
- busy_flag)) {
+ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
+ pfd->revents = mangle_poll(mask);
+ if (mask) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e676c8b0cf5d..8bbb1ad46335 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek);
/**
* seq_release - free the structures associated with sequential file.
- * @file: file in question
* @inode: its inode
+ * @file: file in question
*
* Frees the structures associated with sequential file; can be used
* as ->f_op->release() if you don't have private data to destroy.
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 736bebf93591..d1a5f43ce466 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
fd_install(ufd, file);
} else {
- struct fd f = fdget(ufd);
- if (!fd_file(f))
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
return -EBADF;
ctx = fd_file(f)->private_data;
- if (fd_file(f)->f_op != &signalfd_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &signalfd_fops)
return -EINVAL;
- }
spin_lock_irq(&current->sighand->siglock);
ctx->sigmask = *mask;
spin_unlock_irq(&current->sighand->siglock);
wake_up(&current->sighand->signalfd_wqh);
- fdput(f);
}
return ufd;
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 2ce193609d8b..56439da4f119 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -72,7 +72,6 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
unsigned long srcfd)
{
int rc;
- struct fd src_file;
struct inode *src_inode;
cifs_dbg(FYI, "ioctl copychunk range\n");
@@ -89,8 +88,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
return rc;
}
- src_file = fdget(srcfd);
- if (!fd_file(src_file)) {
+ CLASS(fd, src_file)(srcfd);
+ if (fd_empty(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
@@ -98,20 +97,18 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
- goto out_fput;
+ goto out_drop_write;
}
src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
+ goto out_drop_write;
rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
-out_fput:
- fdput(src_file);
out_drop_write:
mnt_drop_write_file(dst_file);
return rc;
diff --git a/fs/splice.c b/fs/splice.c
index 06232d7e505f..2898fa1e9e63 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1564,21 +1564,6 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
return ret;
}
-static int vmsplice_type(struct fd f, int *type)
-{
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_mode & FMODE_WRITE) {
- *type = ITER_SOURCE;
- } else if (fd_file(f)->f_mode & FMODE_READ) {
- *type = ITER_DEST;
- } else {
- fdput(f);
- return -EBADF;
- }
- return 0;
-}
-
/*
* Note that vmsplice only really supports true splicing _from_ user memory
* to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1602,21 +1587,25 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t error;
- struct fd f;
int type;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_mode & FMODE_WRITE)
+ type = ITER_SOURCE;
+ else if (fd_file(f)->f_mode & FMODE_READ)
+ type = ITER_DEST;
+ else
+ return -EBADF;
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
if (error < 0)
- goto out_fdput;
+ return error;
if (!iov_iter_count(&iter))
error = 0;
@@ -1626,8 +1615,6 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = vmsplice_to_user(fd_file(f), &iter, flags);
kfree(iov);
-out_fdput:
- fdput(f);
return error;
}
@@ -1635,27 +1622,22 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(!len))
return 0;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- error = -EBADF;
- in = fdget(fd_in);
- if (fd_file(in)) {
- out = fdget(fd_out);
- if (fd_file(out)) {
- error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
+ CLASS(fd, in)(fd_in);
+ if (fd_empty(in))
+ return -EBADF;
+
+ CLASS(fd, out)(fd_out);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
len, flags);
- fdput(out);
- }
- fdput(in);
- }
- return error;
}
/*
@@ -2005,25 +1987,19 @@ ssize_t do_tee(struct file *in, struct file *out, size_t len,
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
if (unlikely(!len))
return 0;
- error = -EBADF;
- in = fdget(fdin);
- if (fd_file(in)) {
- out = fdget(fdout);
- if (fd_file(out)) {
- error = do_tee(fd_file(in), fd_file(out), len, flags);
- fdput(out);
- }
- fdput(in);
- }
+ CLASS(fd, in)(fdin);
+ if (fd_empty(in))
+ return -EBADF;
- return error;
+ CLASS(fd, out)(fdout);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return do_tee(fd_file(in), fd_file(out), len, flags);
}
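
One detail worth noting in the splice()/tee() conversions above: with two CLASS(fd) guards in one function, releases run in reverse declaration order on every return path, which is what lets the old nested fdput() ladders collapse into straight-line early returns. A hedged sketch:

static long example_two_fds(int fd_in, int fd_out)
{
	CLASS(fd, in)(fd_in);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fd_out);
	if (fd_empty(out))
		return -EBADF;	/* releases out, then in */

	/* use fd_file(in) / fd_file(out) */
	return 0;		/* both released here as well */
}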
diff --git a/fs/stat.c b/fs/stat.c
index 41e598376d7e..0870e969a8a0 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -23,10 +23,46 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
+#include <trace/events/timestamp.h>
+
#include "internal.h"
#include "mount.h"
/**
+ * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @stat: where to store the resulting values
+ * @request_mask: STATX_* values requested
+ * @inode: inode from which to grab the c/mtime
+ *
+ * Given @inode, grab the ctime and mtime out of it and store the result
+ * in @stat. When fetching the value, flag it as QUERIED (if not already)
+ * so the next write will record a distinct timestamp.
+ *
+ * NB: The QUERIED flag is tracked in the ctime, but we set it there even
+ * if only the mtime was requested, as that ensures that the next mtime
+ * change will be distinct.
+ */
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
+{
+ atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
+
+ /* If neither time was requested, then don't report them */
+ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+ return;
+ }
+
+ stat->mtime = inode_get_mtime(inode);
+ stat->ctime.tv_sec = inode->i_ctime_sec;
+ stat->ctime.tv_nsec = (u32)atomic_read(pcn);
+ if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
+ stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
+ stat->ctime.tv_nsec &= ~I_CTIME_QUERIED;
+ trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime);
+}
+EXPORT_SYMBOL(fill_mg_cmtime);
+
+/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
* @request_mask: statx request_mask
@@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ if (is_mgtime(inode)) {
+ fill_mg_cmtime(stat, request_mask, inode);
+ } else {
+ stat->ctime = inode_get_ctime(inode);
+ stat->mtime = inode_get_mtime(inode);
+ }
+
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -165,7 +207,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
request_mask,
- query_flags | AT_GETATTR_NOSEC);
+ query_flags);
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
@@ -198,9 +240,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
{
int retval;
- if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
- return -EPERM;
-
retval = security_inode_getattr(path);
if (retval)
return retval;
@@ -220,18 +259,13 @@ EXPORT_SYMBOL(vfs_getattr);
*/
int vfs_fstat(int fd, struct kstat *stat)
{
- struct fd f;
- int error;
-
- f = fdget_raw(fd);
- if (!fd_file(f))
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
- fdput(f);
- return error;
+ return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
}
-int getname_statx_lookup_flags(int flags)
+static int statx_lookup_flags(int flags)
{
int lookup_flags = 0;
@@ -239,8 +273,6 @@ int getname_statx_lookup_flags(int flags)
lookup_flags |= LOOKUP_FOLLOW;
if (!(flags & AT_NO_AUTOMOUNT))
lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
return lookup_flags;
}
@@ -277,7 +309,7 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
u32 request_mask)
{
CLASS(fd_raw, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask);
}
@@ -301,7 +333,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned int lookup_flags = getname_statx_lookup_flags(flags);
+ unsigned int lookup_flags = statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
@@ -326,18 +358,11 @@ int vfs_fstatat(int dfd, const char __user *filename,
{
int ret;
int statx_flags = flags | AT_NO_AUTOMOUNT;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
- *
- * If AT_EMPTY_PATH is set, we expect the common case to be that
- * empty path, and avoid doing all the extra pathname work.
- */
- if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return vfs_fstat(dfd, stat);
- name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -774,24 +799,11 @@ SYSCALL_DEFINE5(statx,
struct statx __user *, buffer)
{
int ret;
- unsigned lflags;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Short-circuit handling of NULL and "" paths.
- *
- * For a NULL path we require and accept only the AT_EMPTY_PATH flag
- * (possibly |'d with AT_STATX flags).
- *
- * However, glibc on 32-bit architectures implements fstatat as statx
- * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
- * Supporting this results in the uglification below.
- */
- lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
- if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
- name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
putname(name);
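
A hedged restatement of the handshake fill_mg_cmtime() introduces above: readers mark the ctime as QUERIED with atomic_fetch_or() and report the nanoseconds that were current at that instant; a later timestamp update sees the flag and must choose a strictly newer, fine-grained value, which is what keeps successive m/ctime changes distinguishable. The helper name is illustrative.

static u32 example_read_ctime_nsec(struct inode *inode)
{
	atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
	u32 nsec = (u32)atomic_read(pcn);

	/* Setting the flag and sampling the value is a single atomic
	 * operation, so no queried timestamp can be lost in between.
	 */
	if (!(nsec & I_CTIME_QUERIED))
		nsec = (u32)atomic_fetch_or(I_CTIME_QUERIED, pcn);
	return nsec & ~I_CTIME_QUERIED;	/* strip the flag for callers */
}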
diff --git a/fs/statfs.c b/fs/statfs.c
index 9c7bb27e7932..a45ac85e6048 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -114,13 +114,11 @@ retry:
int fd_statfs(int fd, struct kstatfs *st)
{
- struct fd f = fdget_raw(fd);
- int error = -EBADF;
- if (fd_file(f)) {
- error = vfs_statfs(&fd_file(f)->f_path, st);
- fdput(f);
- }
- return error;
+ CLASS(fd_raw, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ return vfs_statfs(&fd_file(f)->f_path, st);
}
static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
diff --git a/fs/sync.c b/fs/sync.c
index 67df255eb189..2955cd4c77a3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -148,11 +148,11 @@ void emergency_sync(void)
*/
SYSCALL_DEFINE1(syncfs, int, fd)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct super_block *sb;
int ret, ret2;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
sb = fd_file(f)->f_path.dentry->d_sb;
@@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd)
ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err);
- fdput(f);
return ret ? ret : ret2;
}
@@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
{
- struct fd f = fdget(fd);
- int ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- ret = vfs_fsync(fd_file(f), datasync);
- fdput(f);
- }
- return ret;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fsync(fd_file(f), datasync);
}
SYSCALL_DEFINE1(fsync, unsigned int, fd)
@@ -355,16 +352,12 @@ out:
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
unsigned int flags)
{
- int ret;
- struct fd f;
+ CLASS(fd, f)(fd);
- ret = -EBADF;
- f = fdget(fd);
- if (fd_file(f))
- ret = sync_file_range(fd_file(f), offset, nbytes, flags);
+ if (fd_empty(f))
+ return -EBADF;
- fdput(f);
- return ret;
+ return sync_file_range(fd_file(f), offset, nbytes, flags);
}
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 137523e0bb21..9f7eb451a60f 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -79,13 +79,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
return HRTIMER_NORESTART;
}
-static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
- ktime_t now)
+static void timerfd_alarmproc(struct alarm *alarm, ktime_t now)
{
struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
t.alarm);
timerfd_triggered(ctx);
- return ALARMTIMER_NORESTART;
}
/*
@@ -394,19 +392,6 @@ static const struct file_operations timerfd_fops = {
.unlocked_ioctl = timerfd_ioctl,
};
-static int timerfd_fget(int fd, struct fd *p)
-{
- struct fd f = fdget(fd);
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_op != &timerfd_fops) {
- fdput(f);
- return -EINVAL;
- }
- *p = f;
- return 0;
-}
-
SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{
int ufd;
@@ -471,7 +456,6 @@ static int do_timerfd_settime(int ufd, int flags,
const struct itimerspec64 *new,
struct itimerspec64 *old)
{
- struct fd f;
struct timerfd_ctx *ctx;
int ret;
@@ -479,15 +463,17 @@ static int do_timerfd_settime(int ufd, int flags,
!itimerspec64_valid(new))
return -EINVAL;
- ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
+
ctx = fd_file(f)->private_data;
- if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
- fdput(f);
+ if (isalarm(ctx) && !capable(CAP_WAKE_ALARM))
return -EPERM;
- }
timerfd_setup_cancel(ctx, flags);
@@ -535,17 +521,18 @@ static int do_timerfd_settime(int ufd, int flags,
ret = timerfd_setup(ctx, flags, new);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return ret;
}
static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
{
- struct fd f;
struct timerfd_ctx *ctx;
- int ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
ctx = fd_file(f)->private_data;
spin_lock_irq(&ctx->wqh.lock);
@@ -567,7 +554,6 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return 0;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 291583005dd1..3fb308b6e167 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -19,9 +19,9 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/kthread.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/seq_file.h>
-#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include "ubifs.h"
@@ -981,177 +981,120 @@ enum {
Opt_auth_key,
Opt_auth_hash_name,
Opt_ignore,
- Opt_err,
};
-static const match_table_t tokens = {
- {Opt_fast_unmount, "fast_unmount"},
- {Opt_norm_unmount, "norm_unmount"},
- {Opt_bulk_read, "bulk_read"},
- {Opt_no_bulk_read, "no_bulk_read"},
- {Opt_chk_data_crc, "chk_data_crc"},
- {Opt_no_chk_data_crc, "no_chk_data_crc"},
- {Opt_override_compr, "compr=%s"},
- {Opt_auth_key, "auth_key=%s"},
- {Opt_auth_hash_name, "auth_hash_name=%s"},
- {Opt_ignore, "ubi=%s"},
- {Opt_ignore, "vol=%s"},
- {Opt_assert, "assert=%s"},
- {Opt_err, NULL},
+static const struct constant_table ubifs_param_compr[] = {
+ { "none", UBIFS_COMPR_NONE },
+ { "lzo", UBIFS_COMPR_LZO },
+ { "zlib", UBIFS_COMPR_ZLIB },
+ { "zstd", UBIFS_COMPR_ZSTD },
+ {}
};
-/**
- * parse_standard_option - parse a standard mount option.
- * @option: the option to parse
- *
- * Normally, standard mount options like "sync" are passed to file-systems as
- * flags. However, when a "rootflags=" kernel boot parameter is used, they may
- * be present in the options string. This function tries to deal with this
- * situation and parse standard options. Returns 0 if the option was not
- * recognized, and the corresponding integer flag if it was.
- *
- * UBIFS is only interested in the "sync" option, so do not check for anything
- * else.
- */
-static int parse_standard_option(const char *option)
-{
+static const struct constant_table ubifs_param_assert[] = {
+ { "report", ASSACT_REPORT },
+ { "read-only", ASSACT_RO },
+ { "panic", ASSACT_PANIC },
+ {}
+};
- pr_notice("UBIFS: parse %s\n", option);
- if (!strcmp(option, "sync"))
- return SB_SYNCHRONOUS;
- return 0;
-}
+static const struct fs_parameter_spec ubifs_fs_param_spec[] = {
+ fsparam_flag ("fast_unmount", Opt_fast_unmount),
+ fsparam_flag ("norm_unmount", Opt_norm_unmount),
+ fsparam_flag ("bulk_read", Opt_bulk_read),
+ fsparam_flag ("no_bulk_read", Opt_no_bulk_read),
+ fsparam_flag ("chk_data_crc", Opt_chk_data_crc),
+ fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc),
+ fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr),
+ fsparam_enum ("assert", Opt_assert, ubifs_param_assert),
+ fsparam_string ("auth_key", Opt_auth_key),
+ fsparam_string ("auth_hash_name", Opt_auth_hash_name),
+ fsparam_string ("ubi", Opt_ignore),
+ fsparam_string ("vol", Opt_ignore),
+ {}
+};
+
+struct ubifs_fs_context {
+ struct ubifs_mount_opts mount_opts;
+ char *auth_key_name;
+ char *auth_hash_name;
+ unsigned int no_chk_data_crc:1;
+ unsigned int bulk_read:1;
+ unsigned int default_compr:2;
+ unsigned int assert_action:2;
+};
/**
- * ubifs_parse_options - parse mount parameters.
- * @c: UBIFS file-system description object
- * @options: parameters to parse
- * @is_remount: non-zero if this is FS re-mount
+ * ubifs_parse_param - parse a parameter.
+ * @fc: the filesystem context
+ * @param: the parameter to parse
*
* This function parses UBIFS mount options and returns zero in case of success
* and a negative error code in case of failure.
*/
-static int ubifs_parse_options(struct ubifs_info *c, char *options,
- int is_remount)
+static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- if (!options)
- return 0;
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+ int opt;
- while ((p = strsep(&options, ","))) {
- int token;
+ opt = fs_parse(fc, ubifs_fs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
+ switch (opt) {
/*
* %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
* We accept them in order to be backward-compatible. But this
* should be removed at some point.
*/
- case Opt_fast_unmount:
- c->mount_opts.unmount_mode = 2;
- break;
- case Opt_norm_unmount:
- c->mount_opts.unmount_mode = 1;
- break;
- case Opt_bulk_read:
- c->mount_opts.bulk_read = 2;
- c->bulk_read = 1;
- break;
- case Opt_no_bulk_read:
- c->mount_opts.bulk_read = 1;
- c->bulk_read = 0;
- break;
- case Opt_chk_data_crc:
- c->mount_opts.chk_data_crc = 2;
- c->no_chk_data_crc = 0;
- break;
- case Opt_no_chk_data_crc:
- c->mount_opts.chk_data_crc = 1;
- c->no_chk_data_crc = 1;
- break;
- case Opt_override_compr:
- {
- char *name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "none"))
- c->mount_opts.compr_type = UBIFS_COMPR_NONE;
- else if (!strcmp(name, "lzo"))
- c->mount_opts.compr_type = UBIFS_COMPR_LZO;
- else if (!strcmp(name, "zlib"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
- else if (!strcmp(name, "zstd"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZSTD;
- else {
- ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- c->mount_opts.override_compr = 1;
- c->default_compr = c->mount_opts.compr_type;
- break;
- }
- case Opt_assert:
- {
- char *act = match_strdup(&args[0]);
-
- if (!act)
- return -ENOMEM;
- if (!strcmp(act, "report"))
- c->assert_action = ASSACT_REPORT;
- else if (!strcmp(act, "read-only"))
- c->assert_action = ASSACT_RO;
- else if (!strcmp(act, "panic"))
- c->assert_action = ASSACT_PANIC;
- else {
- ubifs_err(c, "unknown assert action \"%s\"", act);
- kfree(act);
- return -EINVAL;
- }
- kfree(act);
- break;
- }
- case Opt_auth_key:
- if (!is_remount) {
- c->auth_key_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_key_name)
- return -ENOMEM;
- }
- break;
- case Opt_auth_hash_name:
- if (!is_remount) {
- c->auth_hash_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_hash_name)
- return -ENOMEM;
- }
- break;
- case Opt_ignore:
- break;
- default:
- {
- unsigned long flag;
- struct super_block *sb = c->vfs_sb;
-
- flag = parse_standard_option(p);
- if (!flag) {
- ubifs_err(c, "unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- sb->s_flags |= flag;
- break;
+ case Opt_fast_unmount:
+ ctx->mount_opts.unmount_mode = 2;
+ break;
+ case Opt_norm_unmount:
+ ctx->mount_opts.unmount_mode = 1;
+ break;
+ case Opt_bulk_read:
+ ctx->mount_opts.bulk_read = 2;
+ ctx->bulk_read = 1;
+ break;
+ case Opt_no_bulk_read:
+ ctx->mount_opts.bulk_read = 1;
+ ctx->bulk_read = 0;
+ break;
+ case Opt_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 2;
+ ctx->no_chk_data_crc = 0;
+ break;
+ case Opt_no_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 1;
+ ctx->no_chk_data_crc = 1;
+ break;
+ case Opt_override_compr:
+ ctx->mount_opts.compr_type = result.uint_32;
+ ctx->mount_opts.override_compr = 1;
+ ctx->default_compr = ctx->mount_opts.compr_type;
+ break;
+ case Opt_assert:
+ ctx->assert_action = result.uint_32;
+ break;
+ case Opt_auth_key:
+ if (!is_remount) {
+ kfree(ctx->auth_key_name);
+ ctx->auth_key_name = param->string;
+ param->string = NULL;
}
+ break;
+ case Opt_auth_hash_name:
+ if (!is_remount) {
+ kfree(ctx->auth_hash_name);
+ ctx->auth_hash_name = param->string;
+ param->string = NULL;
}
+ break;
+ case Opt_ignore:
+ break;
}
return 0;
@@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb)
mutex_unlock(&c->umount_mutex);
}
-static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+static int ubifs_reconfigure(struct fs_context *fc)
{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
int err;
struct ubifs_info *c = sb->s_fs_info;
sync_filesystem(sb);
- dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+ dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags);
- err = ubifs_parse_options(c, data, 1);
- if (err) {
- ubifs_err(c, "invalid or unknown remount parameter");
- return err;
- }
+ /*
+ * Apply the mount option changes.
+ * auth_key_name and auth_hash_name are ignored on remount.
+ */
+ c->mount_opts = ctx->mount_opts;
+ c->bulk_read = ctx->bulk_read;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
- if (c->ro_mount && !(*flags & SB_RDONLY)) {
+ if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/W due to prior errors");
return -EROFS;
@@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
err = ubifs_remount_rw(c);
if (err)
return err;
- } else if (!c->ro_mount && (*flags & SB_RDONLY)) {
+ } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/O due to prior errors");
return -EROFS;
@@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = {
.evict_inode = ubifs_evict_inode,
.statfs = ubifs_statfs,
.dirty_inode = ubifs_dirty_inode,
- .remount_fs = ubifs_remount_fs,
.show_options = ubifs_show_options,
.sync_fs = ubifs_sync_fs,
};
/**
* open_ubi - parse UBI device name string and open the UBI device.
- * @name: UBI volume name
+ * @fc: The filesystem context
* @mode: UBI volume open mode
*
* The primary method of mounting UBIFS is by specifying the UBI volume
@@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = {
* returns UBI volume description object in case of success and a negative
* error code in case of failure.
*/
-static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode)
{
struct ubi_volume_desc *ubi;
+ const char *name = fc->source;
int dev, vol;
char *endptr;
- if (!name || !*name)
- return ERR_PTR(-EINVAL);
-
/* First, try to open using the device node path method */
ubi = ubi_open_volume_path(name, mode);
if (!IS_ERR(ubi))
@@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
/* Try the "nodev" method */
if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
/* ubi:NAME method */
if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
return ubi_open_volume_nm(0, name + 4, mode);
if (!isdigit(name[3]))
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
dev = simple_strtoul(name + 3, &endptr, 0);
@@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if (*endptr == '_' && isdigit(endptr[1])) {
vol = simple_strtoul(endptr + 1, &endptr, 0);
if (*endptr != '\0')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
return ubi_open_volume(dev, vol, mode);
}
@@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
return ubi_open_volume_nm(dev, ++endptr, mode);
- return ERR_PTR(-EINVAL);
+invalid_source:
+ return ERR_PTR(invalf(fc, "Invalid source name"));
}
static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
@@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
-static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ubifs_info *c = sb->s_fs_info;
+ struct ubifs_fs_context *ctx = fc->fs_private;
struct inode *root;
int err;
@@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_close;
+ /* Copy in parsed mount options */
+ c->mount_opts = ctx->mount_opts;
+ c->auth_key_name = ctx->auth_key_name;
+ c->auth_hash_name = ctx->auth_hash_name;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->bulk_read = ctx->bulk_read;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
+
+ /* ubifs_info owns auth strings now */
+ ctx->auth_key_name = NULL;
+ ctx->auth_hash_name = NULL;
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
@@ -2264,41 +2221,38 @@ out:
return err;
}
-static int sb_test(struct super_block *sb, void *data)
+static int sb_test(struct super_block *sb, struct fs_context *fc)
{
- struct ubifs_info *c1 = data;
+ struct ubifs_info *c1 = fc->s_fs_info;
struct ubifs_info *c = sb->s_fs_info;
return c->vi.cdev == c1->vi.cdev;
}
-static int sb_set(struct super_block *sb, void *data)
-{
- sb->s_fs_info = data;
- return set_anon_super(sb, NULL);
-}
-
-static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
- const char *name, void *data)
+static int ubifs_get_tree(struct fs_context *fc)
{
struct ubi_volume_desc *ubi;
struct ubifs_info *c;
struct super_block *sb;
int err;
- dbg_gen("name %s, flags %#x", name, flags);
+ if (!fc->source || !*fc->source)
+ return invalf(fc, "No source specified");
+
+ dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags);
/*
* Get UBI device number and volume ID. Mount it read-only so far
* because this might be a new mount point, and UBI allows only one
* read-write user at a time.
*/
- ubi = open_ubi(name, UBI_READONLY);
+ ubi = open_ubi(fc, UBI_READONLY);
if (IS_ERR(ubi)) {
- if (!(flags & SB_SILENT))
+ err = PTR_ERR(ubi);
+ if (!(fc->sb_flags & SB_SILENT))
pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
- current->pid, name, (int)PTR_ERR(ubi));
- return ERR_CAST(ubi);
+ current->pid, fc->source, err);
+ return err;
}
c = alloc_ubifs_info(ubi);
@@ -2306,10 +2260,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
err = -ENOMEM;
goto out_close;
}
+ fc->s_fs_info = c;
dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
- sb = sget(fs_type, sb_test, sb_set, flags, c);
+ sb = sget_fc(fc, sb_test, set_anon_super_fc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
kfree(c);
@@ -2321,12 +2276,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
kfree(c);
/* A new mount point for already mounted UBIFS */
dbg_gen("this ubi volume is already mounted");
- if (!!(flags & SB_RDONLY) != c1->ro_mount) {
+ if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) {
err = -EBUSY;
goto out_deact;
}
} else {
- err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
+ err = ubifs_fill_super(sb, fc);
if (err)
goto out_deact;
/* We do not support atime */
@@ -2340,13 +2295,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
/* 'fill_super()' opens ubi again so we must close it here */
ubi_close_volume(ubi);
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
out_deact:
deactivate_locked_super(sb);
out_close:
ubi_close_volume(ubi);
- return ERR_PTR(err);
+ return err;
}
static void kill_ubifs_super(struct super_block *s)
@@ -2356,10 +2312,61 @@ static void kill_ubifs_super(struct super_block *s)
kfree(c);
}
+static void ubifs_free_fc(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ kfree(ctx->auth_key_name);
+ kfree(ctx->auth_hash_name);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations ubifs_context_ops = {
+ .free = ubifs_free_fc,
+ .parse_param = ubifs_parse_param,
+ .get_tree = ubifs_get_tree,
+ .reconfigure = ubifs_reconfigure,
+};
+
+static int ubifs_init_fs_context(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* Initialize for first mount */
+ ctx->no_chk_data_crc = 1;
+ ctx->assert_action = ASSACT_RO;
+ } else {
+ struct ubifs_info *c = fc->root->d_sb->s_fs_info;
+
+ /*
+ * Preserve existing options across remounts.
+ * auth_key_name and auth_hash_name are not remountable.
+ */
+ ctx->mount_opts = c->mount_opts;
+ ctx->bulk_read = c->bulk_read;
+ ctx->no_chk_data_crc = c->no_chk_data_crc;
+ ctx->default_compr = c->default_compr;
+ ctx->assert_action = c->assert_action;
+ }
+
+ fc->ops = &ubifs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type ubifs_fs_type = {
.name = "ubifs",
.owner = THIS_MODULE,
- .mount = ubifs_mount,
+ .init_fs_context = ubifs_init_fs_context,
+ .parameters = ubifs_fs_param_spec,
.kill_sb = kill_ubifs_super,
};
MODULE_ALIAS_FS("ubifs");
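
Beyond cleaner remount handling, the fs_context conversion above makes UBIFS reachable through the new mount API, where each parameter is sent and validated individually. A hedged userspace sketch (not part of the patch), assuming a libc whose <sys/syscall.h> defines the fsopen family and that "ubi0:rootfs" names an existing UBI volume:

#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "ubifs", 0);
	if (fsfd < 0)
		return perror("fsopen"), 1;

	/* Each option is parsed by ubifs_parse_param() as it arrives. */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source",
		"ubi0:rootfs", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "chk_data_crc",
		NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "compr",
		"zstd", 0);
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL,
		    NULL, 0) < 0)
		return perror("fsconfig"), 1;

	mfd = syscall(SYS_fsmount, fsfd, 0, 0);
	if (mfd < 0)
		return perror("fsmount"), 1;
	if (syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		return perror("move_mount"), 1;
	return 0;
}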
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 53c11be2b2c1..194ed3ab945e 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -33,6 +33,29 @@ static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *
static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[];
static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int);
+static void adjust_free_blocks(struct super_block *sb,
+ struct ufs_cylinder_group *ucg,
+ struct ufs_cg_private_info *ucpi,
+ unsigned fragment, int delta)
+{
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+
+ if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+ ufs_clusteracct(sb, ucpi, fragment, delta);
+
+ fs32_add(sb, &ucg->cg_cs.cs_nbfree, delta);
+ uspi->cs_total.cs_nbfree += delta;
+ fs32_add(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, delta);
+
+ if (uspi->fs_magic != UFS2_MAGIC) {
+ unsigned cylno = ufs_cbtocylno(fragment);
+
+ fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
+ ufs_cbtorpos(fragment)), delta);
+ fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), delta);
+ }
+}
+
/*
* Free 'count' fragments from fragment number 'fragment'
*/
@@ -43,7 +66,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, bit, end_bit, bbase, blkmap, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -51,7 +73,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
- if (ufs_fragnum(fragment) + count > uspi->s_fpg)
+ if (ufs_fragnum(fragment) + count > uspi->s_fpb)
ufs_error (sb, "ufs_free_fragments", "internal error");
mutex_lock(&UFS_SB(sb)->s_lock);
@@ -94,23 +116,11 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
/*
* Trying to reassemble free fragments into block
*/
- blkno = ufs_fragstoblks (bbase);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, bbase)) {
fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
uspi->cs_total.cs_nffree -= uspi->s_fpb;
fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno (bbase);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(bbase)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, bbase, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -139,7 +149,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned overflow, cgno, bit, end_bit, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -181,26 +190,12 @@ do_more:
}
for (i = bit; i < end_bit; i += uspi->s_fpb) {
- blkno = ufs_fragstoblks(i);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, i)) {
ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
}
- ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+ ubh_setblock(uspi, ucpi, i);
inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
-
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno(i);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(i)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, i, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -234,13 +229,13 @@ failed:
* situated at the end of file.
*
* We can come here from ufs_writepage or ufs_prepare_write,
- * locked_page is argument of these functions, so we already lock it.
+ * locked_folio is an argument of these functions, so it is already locked.
*/
static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
- sector_t newb, struct page *locked_page)
+ sector_t newb, struct folio *locked_folio)
{
- struct folio *folio, *locked_folio = page_folio(locked_page);
+ struct folio *folio;
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -337,7 +332,7 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n,
u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
u64 goal, unsigned count, int *err,
- struct page *locked_page)
+ struct folio *locked_folio)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
@@ -417,7 +412,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
+ newcount - oldcount, locked_folio != NULL);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -441,7 +436,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
read_sequnlock_excl(&UFS_I(inode)->meta_lock);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
@@ -462,11 +457,11 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, request, err);
if (result) {
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
- uspi->s_sbbase + result, locked_page);
+ uspi->s_sbbase + result, locked_folio);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -698,7 +693,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct super_block * sb;
struct ufs_sb_private_info * uspi;
struct ufs_cylinder_group * ucg;
- u64 result, blkno;
+ u64 result;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -716,7 +711,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
/*
* If the requested block is available, use it.
*/
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+ if (ubh_isblockset(uspi, ucpi, goal)) {
result = goal;
goto gotit;
}
@@ -729,22 +724,8 @@ norot:
gotit:
if (!try_add_frags(inode, uspi->s_fpb))
return 0;
- blkno = ufs_fragstoblks(result);
- ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, -1);
-
- fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree--;
- fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno((unsigned)result);
-
- fs16_sub(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos((unsigned)result)), 1);
- fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ ubh_clrblock(uspi, ucpi, result);
+ adjust_free_blocks(sb, ucg, ucpi, result, -1);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -863,12 +844,12 @@ static u64 ufs_bitmap_search(struct super_block *sb,
}
static void ufs_clusteracct(struct super_block * sb,
- struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt)
+ struct ufs_cg_private_info * ucpi, unsigned frag, int cnt)
{
- struct ufs_sb_private_info * uspi;
+ struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
int i, start, end, forw, back;
+ unsigned blkno = ufs_fragstoblks(frag);
- uspi = UFS_SB(sb)->s_uspi;
if (uspi->s_contigsumsize <= 0)
return;
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 1abe5454de47..a2813270c303 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -26,7 +26,7 @@
 * Read a cylinder group into the cache. The memory for the ufs_cg_private_info
 * structure is already allocated during ufs_read_super.
*/
-static void ufs_read_cylinder (struct super_block * sb,
+static bool ufs_read_cylinder(struct super_block *sb,
unsigned cgno, unsigned bitmap_nr)
{
struct ufs_sb_info * sbi = UFS_SB(sb);
@@ -46,9 +46,11 @@ static void ufs_read_cylinder (struct super_block * sb,
 * We already have the first fragment of the cylinder group block in the buffer
*/
UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
- for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
- if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
+ for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+ UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i);
+ if (!UCPI_UBH(ucpi)->bh[i])
goto failed;
+ }
sbi->s_cgno[bitmap_nr] = cgno;
ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx);
@@ -67,13 +69,14 @@ static void ufs_read_cylinder (struct super_block * sb,
ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
UFSD("EXIT\n");
- return;
+ return true;
failed:
for (j = 1; j < i; j++)
- brelse (sbi->s_ucg[j]);
+ brelse(UCPI_UBH(ucpi)->bh[j]);
sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno);
+ return false;
}
/*
@@ -156,15 +159,14 @@ struct ufs_cg_private_info * ufs_load_cylinder (
UFSD("EXIT (FAILED)\n");
return NULL;
}
- else {
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
- }
} else {
- ufs_read_cylinder (sb, cgno, cgno);
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
+ if (unlikely(!ufs_read_cylinder (sb, cgno, cgno))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
+ UFSD("EXIT\n");
+ return sbi->s_ucpi[cgno];
}
/*
* Cylinder group number cg is in cache but it was not last used,
@@ -195,7 +197,10 @@ struct ufs_cg_private_info * ufs_load_cylinder (
sbi->s_ucpi[j] = sbi->s_ucpi[j-1];
}
sbi->s_ucpi[0] = ucpi;
- ufs_read_cylinder (sb, cgno, 0);
+ if (unlikely(!ufs_read_cylinder (sb, cgno, 0))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
UFSD("EXIT\n");
return sbi->s_ucpi[0];
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index d6e6a2198971..88d0062cfdb9 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -81,10 +81,9 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
}
-/* Releases the page */
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode,
- bool update_times)
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode,
+ bool update_times)
{
loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
@@ -92,17 +91,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, len);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
ufs_set_de_type(dir->i_sb, de, inode->i_mode);
ufs_commit_chunk(folio, pos, len);
- folio_release_kmap(folio, de);
if (update_times)
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
- ufs_handle_dirsync(dir);
+ return ufs_handle_dirsync(dir);
}
static bool ufs_check_folio(struct folio *folio, char *kaddr)
@@ -505,8 +506,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
if (de->d_reclen == 0) {
ufs_error(inode->i_sb, __func__,
"zero-length directory entry");
- err = -EIO;
- goto out;
+ return -EIO;
}
pde = de;
de = ufs_next_entry(sb, de);
@@ -516,18 +516,17 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pos = folio_pos(folio) + from;
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, to - from);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
if (pde)
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(folio, pos, to - from);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
- err = ufs_handle_dirsync(inode);
-out:
- folio_release_kmap(folio, kaddr);
- UFSD("EXIT\n");
- return err;
+ return ufs_handle_dirsync(inode);
}
int ufs_make_empty(struct inode * inode, struct inode *dir)
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 6558882a89ef..487ad1fc2de6 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -42,4 +42,5 @@ const struct file_operations ufs_file_operations = {
.open = generic_file_open,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 5331ae7ebf3e..7dc38fdef2ea 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -220,7 +220,7 @@ changed:
*/
static bool
ufs_extend_tail(struct inode *inode, u64 writes_to,
- int *err, struct page *locked_page)
+ int *err, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -239,7 +239,7 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
p = ufs_get_direct_data_ptr(uspi, ufsi, block);
tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
new_size - (lastfrag & uspi->s_fpbmask), err,
- locked_page);
+ locked_folio);
return tmp != 0;
}
@@ -250,12 +250,11 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
 * @new_fragment: number of newly allocated fragment(s)
 * @err: we set it if something goes wrong
 * @new: we set it if we allocate a new block
- * @locked_page: for ufs_new_fragments()
+ * @locked_folio: for ufs_new_fragments()
*/
-static u64
-ufs_inode_getfrag(struct inode *inode, unsigned index,
+static u64 ufs_inode_getfrag(struct inode *inode, unsigned index,
sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+ int *new, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -264,11 +263,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
unsigned nfrags = uspi->s_fpb;
void *p;
- /* TODO : to be done for write support
- if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
- goto ufs2;
- */
-
p = ufs_get_direct_data_ptr(uspi, ufsi, index);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (tmp)
@@ -288,7 +282,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
goal += uspi->s_fpb;
}
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
- goal, nfrags, err, locked_page);
+ goal, nfrags, err, locked_folio);
if (!tmp) {
*err = -ENOSPC;
@@ -303,21 +297,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
mark_inode_dirty(inode);
out:
return tmp + uspi->s_sbbase;
-
- /* This part : To be implemented ....
- Required only for writing, not required for READ-ONLY.
-ufs2:
-
- u2_block = ufs_fragstoblks(fragment);
- u2_blockoff = ufs_fragnum(fragment);
- p = ufsi->i_u1.u2_i_data + block;
- goal = 0;
-
-repeat2:
- tmp = fs32_to_cpu(sb, *p);
- lastfrag = ufsi->i_lastfrag;
-
- */
}
/**
@@ -329,12 +308,11 @@ repeat2:
* (block will hold this fragment and also uspi->s_fpb-1)
* @err: see ufs_inode_getfrag()
* @new: see ufs_inode_getfrag()
- * @locked_page: see ufs_inode_getfrag()
+ * @locked_folio: see ufs_inode_getfrag()
*/
-static u64
-ufs_inode_getblock(struct inode *inode, u64 ind_block,
- unsigned index, sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block,
+ unsigned index, sector_t new_fragment, int *err,
+ int *new, struct folio *locked_folio)
{
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -369,7 +347,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
else
goal = bh->b_blocknr + uspi->s_fpb;
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
- uspi->s_fpb, err, locked_page);
+ uspi->s_fpb, err, locked_folio);
if (!tmp)
goto out;
@@ -434,14 +412,14 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
unsigned tailfrags = lastfrag & uspi->s_fpbmask;
if (tailfrags && fragment >= lastfrag) {
if (!ufs_extend_tail(inode, fragment,
- &err, bh_result->b_page))
+ &err, bh_result->b_folio))
goto out;
}
}
if (depth == 1) {
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
- &err, &new, bh_result->b_page);
+ &err, &new, bh_result->b_folio);
} else {
int i;
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
@@ -450,7 +428,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
phys64 = ufs_inode_getblock(inode, phys64, offsets[i],
fragment, &err, NULL, NULL);
phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1],
- fragment, &err, &new, bh_result->b_page);
+ fragment, &err, &new, bh_result->b_folio);
}
out:
if (phys64) {
@@ -898,91 +876,84 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+/*
+ * used only for truncation down to direct blocks.
+ */
static void ufs_trunc_direct(struct inode *inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- void *p;
- u64 frag1, frag2, frag3, frag4, block1, block2;
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned int new_frags, old_frags;
+ unsigned int old_slot, new_slot;
+ unsigned int old_tail, new_tail;
struct to_free ctx = {.inode = inode};
- unsigned i, tmp;
UFSD("ENTER: ino %lu\n", inode->i_ino);
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag1 = DIRECT_FRAGMENT;
- frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
- frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
- frag3 = frag4 & ~uspi->s_fpbmask;
- block1 = block2 = 0;
- if (frag2 > frag3) {
- frag2 = frag4;
- frag3 = frag4 = 0;
- } else if (frag2 < frag3) {
- block1 = ufs_fragstoblks (frag2);
- block2 = ufs_fragstoblks (frag3);
- }
-
- UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
- " frag3 %llu, frag4 %llu\n", inode->i_ino,
- (unsigned long long)frag1, (unsigned long long)frag2,
- (unsigned long long)block1, (unsigned long long)block2,
- (unsigned long long)frag3, (unsigned long long)frag4);
-
- if (frag1 >= frag2)
- goto next1;
+ new_frags = DIRECT_FRAGMENT;
+ // new_frags = first fragment past the new EOF
+ old_frags = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+ // old_frags = first fragment past the old EOF or covered by indirects
- /*
- * Free first free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic (sb, "ufs_trunc_direct", "internal error");
- frag2 -= frag1;
- frag1 = ufs_fragnum (frag1);
+ if (new_frags >= old_frags) // expanding - nothing to free
+ goto done;
- ufs_free_fragments(inode, tmp + frag1, frag2);
+ old_tail = ufs_fragnum(old_frags);
+ old_slot = ufs_fragstoblks(old_frags);
+ new_tail = ufs_fragnum(new_frags);
+ new_slot = ufs_fragstoblks(new_frags);
-next1:
- /*
- * Free whole blocks
- */
- for (i = block1 ; i < block2; i++) {
- p = ufs_get_direct_data_ptr(uspi, ufsi, i);
- tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (old_slot == new_slot) { // old_tail > 0
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, old_slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
if (!tmp)
- continue;
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
+ ufs_panic(sb, __func__, "internal error");
+ if (!new_tail) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ }
+ ufs_free_fragments(inode, tmp + new_tail, old_tail - new_tail);
+ } else {
+ unsigned int slot = new_slot;
- free_data(&ctx, tmp, uspi->s_fpb);
- }
+ if (new_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
- free_data(&ctx, 0, 0);
+ ufs_free_fragments(inode, tmp + new_tail,
+ uspi->s_fpb - new_tail);
+ }
+ while (slot < old_slot) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ continue;
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
- if (frag3 >= frag4)
- goto next3;
+ free_data(&ctx, tmp, uspi->s_fpb);
+ }
- /*
- * Free last free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic(sb, "ufs_truncate_direct", "internal error");
- frag4 = ufs_fragnum (frag4);
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
+ free_data(&ctx, 0, 0);
- ufs_free_fragments (inode, tmp, frag4);
- next3:
+ if (old_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ ufs_free_fragments(inode, tmp, old_tail);
+ }
+ }
+done:
UFSD("EXIT: ino %lu\n", inode->i_ino);
}
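
Editor's note: for readers following the new slot/tail arithmetic, ufs_fragstoblks() and ufs_fragnum() split a fragment number into a direct-pointer slot and an offset within that block. A minimal standalone sketch, assuming 8 fragments per block (s_fpbshift == 3); names are local stand-ins for the UFS macros:

#include <stdio.h>

#define FPBSHIFT 3
#define FPBMASK  ((1u << FPBSHIFT) - 1)

static unsigned fragstoblks(unsigned frag) { return frag >> FPBSHIFT; }
static unsigned fragnum(unsigned frag)     { return frag & FPBMASK; }

int main(void)
{
	unsigned new_frags = 11, old_frags = 29; /* hypothetical EOF boundaries */

	printf("new: slot %u tail %u\n", fragstoblks(new_frags), fragnum(new_frags));
	printf("old: slot %u tail %u\n", fragstoblks(old_frags), fragnum(old_frags));
	/* slots strictly between new_slot and old_slot are freed whole;
	 * the partial tails at either end are freed as fragment runs,
	 * matching the three branches in ufs_trunc_direct() above. */
	return 0;
}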
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index c8390976ab6a..38a024c8cccd 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -210,20 +210,18 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct inode * inode = d_inode(dentry);
struct ufs_dir_entry *de;
struct folio *folio;
- int err = -ENOENT;
+ int err;
de = ufs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
- goto out;
+ return -ENOENT;
err = ufs_delete_entry(dir, de, folio);
- if (err)
- goto out;
-
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- err = 0;
-out:
+ if (!err) {
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
+ inode_dec_link_count(inode);
+ }
+ folio_release_kmap(folio, de);
return err;
}
@@ -253,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct ufs_dir_entry * dir_de = NULL;
struct folio *old_folio;
struct ufs_dir_entry *old_de;
- int err = -ENOENT;
+ int err;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
- goto out;
+ return -ENOENT;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
@@ -281,7 +279,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ err = ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ folio_release_kmap(new_folio, new_de);
+ if (err)
+ goto out_dir;
inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
@@ -299,26 +300,20 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* rename.
*/
inode_set_ctime_current(old_inode);
-
- ufs_delete_entry(old_dir, old_de, old_folio);
mark_inode_dirty(old_inode);
- if (dir_de) {
+ err = ufs_delete_entry(old_dir, old_de, old_folio);
+ if (!err && dir_de) {
if (old_dir != new_dir)
- ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0);
- else
- folio_release_kmap(dir_folio, dir_de);
+ err = ufs_set_link(old_inode, dir_de, dir_folio,
+ new_dir, 0);
inode_dec_link_count(old_dir);
}
- return 0;
-
-
out_dir:
if (dir_de)
folio_release_kmap(dir_folio, dir_de);
out_old:
folio_release_kmap(old_folio, old_de);
-out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index bc625788589c..762699c1bcf6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -505,7 +505,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned size, blks, i;
@@ -521,21 +520,13 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
if (!base)
goto failed;
sbi->s_csp = (struct ufs_csum *)space;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
-
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- if (!ubh)
+ for (i = 0; i < blks; i++) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
+ if (!bh)
goto failed;
-
- ubh_ubhcpymem (space, ubh, size);
-
- space += size;
- ubh_brelse (ubh);
- ubh = NULL;
+ memcpy(space, bh->b_data, uspi->s_fsize);
+ space += uspi->s_fsize;
+ brelse (bh);
}
/*
@@ -645,7 +636,6 @@ static void ufs_put_super_internal(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned blks, size, i;
@@ -656,18 +646,17 @@ static void ufs_put_super_internal(struct super_block *sb)
size = uspi->s_cssize;
blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
base = space = (char*) sbi->s_csp;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
-
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- ubh_memcpyubh (ubh, space, size);
- space += size;
- ubh_mark_buffer_uptodate (ubh, 1);
- ubh_mark_buffer_dirty (ubh);
- ubh_brelse (ubh);
+ for (i = 0; i < blks; i++, space += uspi->s_fsize) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
+
+ if (unlikely(!bh)) { // better than an oops...
+ ufs_panic(sb, __func__,
+ "can't write part of cylinder group summary");
+ continue;
+ }
+ memcpy(bh->b_data, space, uspi->s_fsize);
+ mark_buffer_dirty(bh);
+ brelse(bh);
}
for (i = 0; i < sbi->s_cg_loaded; i++) {
ufs_put_cylinder (sb, i);
@@ -1240,11 +1229,7 @@ magic_found:
else
uspi->s_apbshift = uspi->s_bshift - 2;
- uspi->s_2apbshift = uspi->s_apbshift * 2;
- uspi->s_3apbshift = uspi->s_apbshift * 3;
uspi->s_apb = 1 << uspi->s_apbshift;
- uspi->s_2apb = 1 << uspi->s_2apbshift;
- uspi->s_3apb = 1 << uspi->s_3apbshift;
uspi->s_apbmask = uspi->s_apb - 1;
uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS;
uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index a2c762cb65a0..e7df65dd4351 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -88,10 +88,10 @@ struct ufs_inode_info {
#endif
/* balloc.c */
-extern void ufs_free_fragments (struct inode *, u64, unsigned);
-extern void ufs_free_blocks (struct inode *, u64, unsigned);
-extern u64 ufs_new_fragments(struct inode *, void *, u64, u64,
- unsigned, int *, struct page *);
+void ufs_free_fragments (struct inode *, u64 fragment, unsigned count);
+void ufs_free_blocks (struct inode *, u64 fragment, unsigned count);
+u64 ufs_new_fragments(struct inode *, void *, u64 fragment, u64 goal,
+ unsigned count, int *err, struct folio *);
/* cylinder.c */
extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned);
@@ -108,8 +108,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *,
int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *);
int ufs_empty_dir(struct inode *);
struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **);
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode, bool update_times);
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode, bool update_times);
/* file.c */
extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index ef9ead44776a..0905f9a16b91 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -775,12 +775,8 @@ struct ufs_sb_private_info {
__u32 s_fpbmask; /* fragments per block mask */
__u32 s_apb; /* address per block */
- __u32 s_2apb; /* address per block^2 */
- __u32 s_3apb; /* address per block^3 */
__u32 s_apbmask; /* address per block mask */
__u32 s_apbshift; /* address per block shift */
- __u32 s_2apbshift; /* address per block shift * 2 */
- __u32 s_3apbshift; /* address per block shift * 3 */
__u32 s_nspfshift; /* number of sector per fragment shift */
__u32 s_nspb; /* number of sector per block */
__u32 s_inopf; /* inodes per fragment */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 2acf191eb89e..f0e906ab4ddd 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -99,20 +99,6 @@ void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh)
mark_buffer_dirty (ubh->bh[i]);
}
-void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
-{
- unsigned i;
- if (!ubh)
- return;
- if (flag) {
- for ( i = 0; i < ubh->count; i++ )
- set_buffer_uptodate (ubh->bh[i]);
- } else {
- for ( i = 0; i < ubh->count; i++ )
- clear_buffer_uptodate (ubh->bh[i]);
- }
-}
-
void ubh_sync_block(struct ufs_buffer_head *ubh)
{
if (ubh) {
@@ -146,38 +132,6 @@ int ubh_buffer_dirty (struct ufs_buffer_head * ubh)
return result;
}
-void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi,
- unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (mem, ubh->bh[bhno]->b_data, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
-void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (ubh->bh[bhno]->b_data, mem, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
dev_t
ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
{
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index bf708b68f150..391bb4f11d74 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -263,14 +263,9 @@ extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, str
extern void ubh_brelse (struct ufs_buffer_head *);
extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
-extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
extern void ubh_sync_block(struct ufs_buffer_head *);
extern void ubh_bforget (struct ufs_buffer_head *);
extern int ubh_buffer_dirty (struct ufs_buffer_head *);
-#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
-extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned);
-#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
-extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* These functions work with cache pages */
struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
@@ -455,65 +450,69 @@ static inline unsigned _ubh_find_last_zero_bit_(
return (base << uspi->s_bpfshift) + pos - begin;
}
-#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block))
-
-#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block)
-static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline int ubh_isblockset(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
u8 mask;
+
switch (uspi->s_fpb) {
case 8:
- return (*ubh_get_addr (ubh, begin + block) == 0xff);
+ return *p == 0xff;
case 4:
- mask = 0x0f << ((block & 0x01) << 2);
- return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask;
+ mask = 0x0f << (frag & 4);
+ return (*p & mask) == mask;
case 2:
- mask = 0x03 << ((block & 0x03) << 1);
- return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask;
+ mask = 0x03 << (frag & 6);
+ return (*p & mask) == mask;
case 1:
- mask = 0x01 << (block & 0x07);
- return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask;
+ mask = 0x01 << (frag & 7);
+ return (*p & mask) == mask;
}
return 0;
}
-#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
-static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_clrblock(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr (ubh, begin + block) = 0x00;
+ *p = 0x00;
return;
case 4:
- *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2));
+ *p &= ~(0x0f << (frag & 4));
return;
case 2:
- *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1));
+ *p &= ~(0x03 << (frag & 6));
return;
case 1:
- *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07)));
+ *p &= ~(0x01 << (frag & 7));
return;
}
}
-#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block)
-static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_setblock(struct ufs_sb_private_info * uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr(ubh, begin + block) = 0xff;
+ *p = 0xff;
return;
case 4:
- *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2));
+ *p |= 0x0f << (frag & 4);
return;
case 2:
- *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1));
+ *p |= 0x03 << (frag & 6);
return;
case 1:
- *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07)));
+ *p |= 0x01 << (frag & 7);
return;
}
}
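
Editor's note: the rewritten helpers take the fragment number directly and derive both the byte (ucpi->c_freeoff + (frag >> 3)) and the in-byte mask from it. A standalone sketch of the mask selection, assuming the free map packs 8/s_fpb block bits per byte as the switch above implies:

#include <stdio.h>

static unsigned char block_mask(unsigned fpb, unsigned frag)
{
	switch (fpb) {
	case 8: return 0xff;			/* whole byte is one block */
	case 4: return 0x0f << (frag & 4);	/* nibble, picked by bit 2 */
	case 2: return 0x03 << (frag & 6);	/* bit pair */
	case 1: return 0x01 << (frag & 7);	/* single bit */
	}
	return 0;
}

int main(void)
{
	/* with 4 fragments per block, fragments 0-3 share the low nibble
	 * of byte frag >> 3 and fragments 4-7 the high nibble */
	printf("%#x %#x\n", block_mask(4, 2), block_mask(4, 6)); /* 0xf 0xf0 */
	return 0;
}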
diff --git a/fs/unicode/README.utf8data b/fs/unicode/README.utf8data
index c73786807d3b..f75567e28138 100644
--- a/fs/unicode/README.utf8data
+++ b/fs/unicode/README.utf8data
@@ -1,4 +1,4 @@
-The utf8data.h file in this directory is generated from the Unicode
+The utf8data.c file in this directory is generated from the Unicode
Character Database for version 12.1.0 of the Unicode standard.
The full set of files can be found here:
@@ -45,13 +45,13 @@ Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1:
make REGENERATE_UTF8DATA=1 fs/unicode/
-After sanity checking the newly generated utf8data.h file (the
+After sanity checking the newly generated utf8data.c file (the
version generated from the 12.1.0 UCD should be 4,109 lines long, and
have a total size of 324k) and/or comparing it with the older version
-of utf8data.h_shipped, rename it to utf8data.h_shipped.
+of utf8data.c_shipped, rename it to utf8data.c_shipped.
If you are a kernel developer updating to a newer version of the
Unicode Character Database, please update this README.utf8data file
with the version of the UCD that was used, the md5sum and sha1sums of
-the *.txt files, before checking in the new versions of the utf8data.h
+the *.txt files, before checking in the new versions of the utf8data.c
and README.utf8data files.
diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
index b2bd08250c7a..6b095dd32e0f 100644
--- a/fs/unicode/mkutf8data.c
+++ b/fs/unicode/mkutf8data.c
@@ -36,7 +36,7 @@
#define FOLD_NAME "CaseFolding.txt"
#define NORM_NAME "NormalizationCorrections.txt"
#define TEST_NAME "NormalizationTest.txt"
-#define UTF8_NAME "utf8data.h"
+#define UTF8_NAME "utf8data.c"
const char *age_name = AGE_NAME;
const char *ccc_name = CCC_NAME;
@@ -3269,7 +3269,7 @@ static void write_file(void)
}
fprintf(file, "};\n");
fprintf(file, "\n");
- fprintf(file, "struct utf8data_table utf8_data_table = {\n");
+ fprintf(file, "const struct utf8data_table utf8_data_table = {\n");
fprintf(file, "\t.utf8agetab = utf8agetab,\n");
fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
fprintf(file, "\n");
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 8395066341a4..6fc9ab8667e6 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -198,7 +198,7 @@ struct unicode_map *utf8_load(unsigned int version)
return um;
out_symbol_put:
- symbol_put(um->tables);
+ symbol_put(utf8_data_table);
out_free_um:
kfree(um);
return ERR_PTR(-EINVAL);
@@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um)
}
EXPORT_SYMBOL(utf8_unload);
+/**
+ * utf8_parse_version - Parse a UTF-8 version number from a string
+ *
+ * @version: input string
+ *
+ * Returns the parsed version on success, or a negative error code on failure
+ */
+int utf8_parse_version(char *version)
+{
+ substring_t args[3];
+ unsigned int maj, min, rev;
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ if (match_token(version, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+ match_int(&args[2], &rev))
+ return -EINVAL;
+
+ return UNICODE_AGE(maj, min, rev);
+}
+EXPORT_SYMBOL(utf8_parse_version);
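
Editor's note: a hedged usage sketch for the new helper. Only utf8_parse_version() comes from the code above; the wrapper and its names are hypothetical, e.g. a filesystem parsing a "version=12.1.0" mount option:

static int example_parse_encoding(char *arg, unsigned int *age)
{
	int version = utf8_parse_version(arg);	/* e.g. "12.1.0" */

	if (version < 0)
		return version;			/* -EINVAL on malformed input */
	*age = version;				/* packed UNICODE_AGE() value */
	return 0;
}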
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 600e15efe9ed..5ddaf27b21a6 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -17,9 +17,6 @@
static unsigned int failed_tests;
static unsigned int total_tests;
-/* Tests will be based on this version. */
-#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
-
#define _test(cond, func, line, fmt, ...) do { \
total_tests++; \
if (!cond) { \
diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped
index ac2da4ba2dc0..ef15d52900d0 100644
--- a/fs/unicode/utf8data.c_shipped
+++ b/fs/unicode/utf8data.c_shipped
@@ -4096,7 +4096,7 @@ static const unsigned char utf8data[64080] = {
0x80,0xcf,0x86,0x85,0x84,0xcf,0x86,0xcf,0x06,0x02,0x00,0x00,0x00,0x00,0x00,0x00
};
-struct utf8data_table utf8_data_table = {
+const struct utf8data_table utf8_data_table = {
.utf8agetab = utf8agetab,
.utf8agetab_size = ARRAY_SIZE(utf8agetab),
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index bd00d587747a..fc703aa4b28e 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -78,6 +78,6 @@ struct utf8data_table {
const unsigned char *utf8data;
};
-extern struct utf8data_table utf8_data_table;
+extern const struct utf8data_table utf8_data_table;
#endif /* UTF8NORM_H */
diff --git a/fs/utimes.c b/fs/utimes.c
index 99b26f792b89..c7c7958e57b2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -108,18 +108,13 @@ retry:
static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
{
- struct fd f;
- int error;
-
if (flags)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_utimes(&fd_file(f)->f_path, times);
- fdput(f);
- return error;
+ return vfs_utimes(&fd_file(f)->f_path, times);
}
/*
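
Editor's note: do_utimes_fd() shows the conversion applied throughout this series: CLASS(fd, f)(fd) declares a scope-guarded descriptor reference, so every return path drops it without an explicit fdput(). A minimal sketch of the pattern, not from this patch (the fsync call is just a stand-in):

static int example_fd_op(int fd)
{
	CLASS(fd, f)(fd);		/* reference dropped on scope exit */

	if (fd_empty(f))
		return -EBADF;		/* no fdput() needed here */
	return vfs_fsync(fd_file(f), 0);/* nor here */
}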
diff --git a/fs/xattr.c b/fs/xattr.c
index 05ec7e7d9e87..02bee149ad96 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -586,25 +586,32 @@ retry_deleg:
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
+int import_xattr_name(struct xattr_name *kname, const char __user *name)
+{
+ int error = strncpy_from_user(kname->name, name,
+ sizeof(kname->name));
+ if (error == 0 || error == sizeof(kname->name))
+ return -ERANGE;
+ if (error < 0)
+ return error;
+ return 0;
+}
+
/*
* Extended attribute SET operations
*/
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx)
{
int error;
if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(ctx->kname->name, name,
- sizeof(ctx->kname->name));
- if (error == 0 || error == sizeof(ctx->kname->name))
- return -ERANGE;
- if (error < 0)
+ error = import_xattr_name(ctx->kname, name);
+ if (error)
return error;
- error = 0;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
@@ -619,8 +626,8 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx)
+static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct kernel_xattr_ctx *ctx)
{
if (is_posix_acl_xattr(ctx->kname->name))
return do_set_acl(idmap, dentry, ctx->kname->name,
@@ -630,32 +637,32 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
ctx->kvalue, ctx->size, ctx->flags);
}
-static int path_setxattr(const char __user *pathname,
- const char __user *name, const void __user *value,
- size_t size, int flags, unsigned int lookup_flags)
+int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx)
+{
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
struct path path;
int error;
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx);
+ error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -665,80 +672,121 @@ retry:
}
out:
+ putname(filename);
+ return error;
+}
+
+static int path_setxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ const void __user *value, size_t size, int flags)
+{
+ struct xattr_name kname;
+ struct kernel_xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ struct filename *filename;
+ unsigned int lookup_flags = 0;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ error = -EBADF;
+ else
+ error = file_setxattr(fd_file(f), &ctx);
+ } else {
+ error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
+ }
kvfree(ctx.kvalue);
return error;
}
+SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, const struct xattr_args __user *, uargs,
+ size_t, usize)
+{
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (error)
+ return error;
+
+ return path_setxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size,
+ args.flags);
+}
+
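Editor's note: from userspace the new syscall takes an extensible struct xattr_args. A hypothetical caller, assuming a libc that defines __NR_setxattrat and a uapi <linux/xattr.h> exporting the structure with the value/size/flags fields used above:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/xattr.h>

static int set_user_comment(int dirfd, const char *path, const char *text)
{
	struct xattr_args args = {
		.value = (uint64_t)(uintptr_t)text,
		.size  = strlen(text),
		.flags = 0,		/* or XATTR_CREATE / XATTR_REPLACE */
	};

	/* AT_SYMLINK_NOFOLLOW gives lsetxattr semantics relative to dirfd */
	return (int)syscall(__NR_setxattrat, dirfd, path, AT_SYMLINK_NOFOLLOW,
			    "user.comment", &args, sizeof(args));
}
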
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+ return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
}
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, 0);
+ return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size, flags);
}
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
- int error;
-
- CLASS(fd, f)(fd);
- if (!fd_file(f))
- return -EBADF;
-
- audit_file(fd_file(f));
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = do_setxattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, &ctx);
- mnt_drop_write_file(fd_file(f));
- }
- kvfree(ctx.kvalue);
- return error;
+ return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
+ value, size, flags);
}
/*
* Extended attribute GET operations
*/
-ssize_t
+static ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
- struct xattr_ctx *ctx)
+ struct kernel_xattr_ctx *ctx)
{
ssize_t error;
char *kname = ctx->kname->name;
+ void *kvalue = NULL;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
ctx->size = XATTR_SIZE_MAX;
- ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
- if (!ctx->kvalue)
+ kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!kvalue)
return -ENOMEM;
}
- if (is_posix_acl_xattr(ctx->kname->name))
- error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(kname))
+ error = do_get_acl(idmap, d, kname, kvalue, ctx->size);
else
- error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size);
+ error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size);
if (error > 0) {
- if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
+ if (ctx->size && copy_to_user(ctx->value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to return a value bigger
@@ -746,79 +794,114 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
error = -E2BIG;
}
+ kvfree(kvalue);
return error;
}
-static ssize_t
-getxattr(struct mnt_idmap *idmap, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
+ audit_file(f);
+ return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
+}
+
+/* unconditionally consumes filename */
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
+{
+ struct path path;
ssize_t error;
+retry:
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
+ if (error)
+ goto out;
+ error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx);
+ path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+out:
+ putname(filename);
+ return error;
+}
+
+static ssize_t path_getxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ void __user *value, size_t size)
+{
struct xattr_name kname;
- struct xattr_ctx ctx = {
+ struct kernel_xattr_ctx ctx = {
.value = value,
- .kvalue = NULL,
.size = size,
.kname = &kname,
.flags = 0,
};
+ struct filename *filename;
+ ssize_t error;
- error = strncpy_from_user(kname.name, name, sizeof(kname.name));
- if (error == 0 || error == sizeof(kname.name))
- error = -ERANGE;
- if (error < 0)
- return error;
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
- error = do_getxattr(idmap, d, &ctx);
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
- kvfree(ctx.kvalue);
- return error;
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_getxattr(fd_file(f), &ctx);
+ } else {
+ int lookup_flags = 0;
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+ return filename_getxattr(dfd, filename, lookup_flags, &ctx);
+ }
}
-static ssize_t path_getxattr(const char __user *pathname,
- const char __user *name, void __user *value,
- size_t size, unsigned int lookup_flags)
+SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
{
- struct path path;
- ssize_t error;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
if (error)
return error;
- error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ return path_getxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size);
}
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+ return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
}
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, 0);
+ return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size);
}
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
void __user *, value, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry,
- name, value, size);
- fdput(f);
- return error;
+ return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
}
/*
@@ -853,47 +936,80 @@ listxattr(struct dentry *d, char __user *list, size_t size)
return error;
}
-static ssize_t path_listxattr(const char __user *pathname, char __user *list,
- size_t size, unsigned int lookup_flags)
+static
+ssize_t file_listxattr(struct file *f, char __user *list, size_t size)
+{
+ audit_file(f);
+ return listxattr(f->f_path.dentry, list, size);
+}
+
+/* unconditionally consumes filename */
+static
+ssize_t filename_listxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags,
+ char __user *list, size_t size)
{
struct path path;
ssize_t error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = listxattr(path.dentry, list, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static ssize_t path_listxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, char __user *list,
+ size_t size)
+{
+ struct filename *filename;
+ int lookup_flags;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_listxattr(fd_file(f), list, size);
+ }
+
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_listxattr(dfd, filename, lookup_flags, list, size);
+}
+
+SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags,
+ char __user *, list, size_t, size)
+{
+ return path_listxattrat(dfd, pathname, at_flags, list, size);
+}
+
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+ return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
}
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, 0);
+ return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
}
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = listxattr(fd_file(f)->f_path.dentry, list, size);
- fdput(f);
- return error;
+ return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
}
/*
@@ -907,25 +1023,33 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name)
return vfs_removexattr(idmap, d, name);
}
-static int path_removexattr(const char __user *pathname,
- const char __user *name, unsigned int lookup_flags)
+static int file_removexattr(struct file *f, struct xattr_name *kname)
+{
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = removexattr(file_mnt_idmap(f),
+ f->f_path.dentry, kname->name);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+static int filename_removexattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct xattr_name *kname)
{
struct path path;
int error;
- char kname[XATTR_NAME_MAX + 1];
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_idmap(path.mnt), path.dentry, kname);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -933,45 +1057,58 @@ retry:
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static int path_removexattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name)
+{
+ struct xattr_name kname;
+ struct filename *filename;
+ unsigned int lookup_flags;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_removexattr(fd_file(f), &kname);
+ }
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_removexattr(dfd, filename, lookup_flags, &kname);
+}
+
+SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags, const char __user *, name)
+{
+ return path_removexattrat(dfd, pathname, at_flags, name);
+}
+
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+ return path_removexattrat(AT_FDCWD, pathname, 0, name);
}
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, 0);
+ return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
}
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
- struct fd f = fdget(fd);
- char kname[XATTR_NAME_MAX + 1];
- int error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
-
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = removexattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, kname);
- mnt_drop_write_file(fd_file(f));
- }
- fdput(f);
- return error;
+ return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
}
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
@@ -1005,9 +1142,10 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
- int err = 0;
for_each_xattr_handler(handlers, handler) {
+ int err;
+
if (!handler->name || (handler->list && !handler->list(dentry)))
continue;
err = xattr_list_one(&buffer, &remaining_size, handler->name);
@@ -1015,7 +1153,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return err;
}
- return err ? err : buffer_size - remaining_size;
+ return buffer_size - remaining_size;
}
EXPORT_SYMBOL(generic_listxattr);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index dd692619bed5..ed9b0dabc1f1 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -14,7 +14,9 @@ xfs-y += xfs_trace.o
# build the libxfs code first
xfs-y += $(addprefix libxfs/, \
+ xfs_group.o \
xfs_ag.o \
+ xfs_ag_resv.o \
xfs_alloc.o \
xfs_alloc_btree.o \
xfs_attr.o \
@@ -42,7 +44,8 @@ xfs-y += $(addprefix libxfs/, \
xfs_inode_buf.o \
xfs_inode_util.o \
xfs_log_rlimit.o \
- xfs_ag_resv.o \
+ xfs_metadir.o \
+ xfs_metafile.o \
xfs_parent.o \
xfs_rmap.o \
xfs_rmap_btree.o \
@@ -58,6 +61,7 @@ xfs-y += $(addprefix libxfs/, \
# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
+ xfs_rtgroup.o \
)
# highlevel code
@@ -171,6 +175,7 @@ xfs-y += $(addprefix scrub/, \
inode.o \
iscan.o \
listxattr.o \
+ metapath.o \
nlinks.o \
parent.o \
readdir.o \
@@ -186,6 +191,7 @@ xfs-y += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rgsuper.o \
rtbitmap.o \
rtsummary.o \
)
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5ca8d0106827..b59cb461e096 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -30,86 +30,7 @@
#include "xfs_trace.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-
-
-/*
- * Passive reference counting access wrappers to the perag structures. If the
- * per-ag structure is to be freed, the freeing code is responsible for cleaning
- * up objects with passive references before freeing the structure. This is
- * things like cached buffers.
- */
-struct xfs_perag *
-xfs_perag_get(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- rcu_read_lock();
- pag = xa_load(&mp->m_perags, agno);
- if (pag) {
- trace_xfs_perag_get(pag, _RET_IP_);
- ASSERT(atomic_read(&pag->pag_ref) >= 0);
- atomic_inc(&pag->pag_ref);
- }
- rcu_read_unlock();
- return pag;
-}
-
-/* Get a passive reference to the given perag. */
-struct xfs_perag *
-xfs_perag_hold(
- struct xfs_perag *pag)
-{
- ASSERT(atomic_read(&pag->pag_ref) > 0 ||
- atomic_read(&pag->pag_active_ref) > 0);
-
- trace_xfs_perag_hold(pag, _RET_IP_);
- atomic_inc(&pag->pag_ref);
- return pag;
-}
-
-void
-xfs_perag_put(
- struct xfs_perag *pag)
-{
- trace_xfs_perag_put(pag, _RET_IP_);
- ASSERT(atomic_read(&pag->pag_ref) > 0);
- atomic_dec(&pag->pag_ref);
-}
-
-/*
- * Active references for perag structures. This is for short term access to the
- * per ag structures for walking trees or accessing state. If an AG is being
- * shrunk or is offline, then this will fail to find that AG and return NULL
- * instead.
- */
-struct xfs_perag *
-xfs_perag_grab(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- rcu_read_lock();
- pag = xa_load(&mp->m_perags, agno);
- if (pag) {
- trace_xfs_perag_grab(pag, _RET_IP_);
- if (!atomic_inc_not_zero(&pag->pag_active_ref))
- pag = NULL;
- }
- rcu_read_unlock();
- return pag;
-}
-
-void
-xfs_perag_rele(
- struct xfs_perag *pag)
-{
- trace_xfs_perag_rele(pag, _RET_IP_);
- if (atomic_dec_and_test(&pag->pag_active_ref))
- wake_up(&pag->pag_active_wq);
-}
+#include "xfs_group.h"
/*
* xfs_initialize_perag_data
@@ -184,6 +105,18 @@ out:
return error;
}
+static void
+xfs_perag_uninit(
+ struct xfs_group *xg)
+{
+#ifdef __KERNEL__
+ struct xfs_perag *pag = to_perag(xg);
+
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
+ xfs_buf_cache_destroy(&pag->pag_bcache);
+#endif
+}
+
/*
* Free up the per-ag resources within the specified AG range.
*/
@@ -196,22 +129,8 @@ xfs_free_perag_range(
{
xfs_agnumber_t agno;
- for (agno = first_agno; agno < end_agno; agno++) {
- struct xfs_perag *pag = xa_erase(&mp->m_perags, agno);
-
- ASSERT(pag);
- XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
- xfs_defer_drain_free(&pag->pag_intents_drain);
-
- cancel_delayed_work_sync(&pag->pag_blockgc_work);
- xfs_buf_cache_destroy(&pag->pag_bcache);
-
- /* drop the mount's active reference */
- xfs_perag_rele(pag);
- XFS_IS_CORRUPT(pag->pag_mount,
- atomic_read(&pag->pag_active_ref) != 0);
- kfree_rcu_mightsleep(pag);
- }
+ for (agno = first_agno; agno < end_agno; agno++)
+ xfs_group_free(mp, agno, XG_TYPE_AG, xfs_perag_uninit);
}
/* Find the size of the AG, in blocks. */
@@ -273,6 +192,10 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Update the perag of the previous tail AG if it has been changed during
+ * recovery (i.e. recovery of a growfs operation).
+ */
int
xfs_update_last_ag_size(
struct xfs_mount *mp,
@@ -282,88 +205,88 @@ xfs_update_last_ag_size(
if (!pag)
return -EFSCORRUPTED;
- pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1,
- mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
- __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp,
+ prev_agcount - 1, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks);
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
&pag->agino_max);
xfs_perag_rele(pag);
return 0;
}
-int
-xfs_initialize_perag(
+static int
+xfs_perag_alloc(
struct xfs_mount *mp,
- xfs_agnumber_t old_agcount,
- xfs_agnumber_t new_agcount,
- xfs_rfsblock_t dblocks,
- xfs_agnumber_t *maxagi)
+ xfs_agnumber_t index,
+ xfs_agnumber_t agcount,
+ xfs_rfsblock_t dblocks)
{
struct xfs_perag *pag;
- xfs_agnumber_t index;
int error;
- for (index = old_agcount; index < new_agcount; index++) {
- pag = kzalloc(sizeof(*pag), GFP_KERNEL);
- if (!pag) {
- error = -ENOMEM;
- goto out_unwind_new_pags;
- }
- pag->pag_agno = index;
- pag->pag_mount = mp;
-
- error = xa_insert(&mp->m_perags, index, pag, GFP_KERNEL);
- if (error) {
- WARN_ON_ONCE(error == -EBUSY);
- goto out_free_pag;
- }
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL);
+ if (!pag)
+ return -ENOMEM;
#ifdef __KERNEL__
- /* Place kernel structure only init below this point. */
- spin_lock_init(&pag->pag_ici_lock);
- spin_lock_init(&pag->pagb_lock);
- spin_lock_init(&pag->pag_state_lock);
- INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
- INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- xfs_defer_drain_init(&pag->pag_intents_drain);
- init_waitqueue_head(&pag->pagb_wait);
- init_waitqueue_head(&pag->pag_active_wq);
- pag->pagb_count = 0;
- pag->pagb_tree = RB_ROOT;
- xfs_hooks_init(&pag->pag_rmap_update_hooks);
+ /* Place kernel structure only init below this point. */
+ spin_lock_init(&pag->pag_ici_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+ INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
#endif /* __KERNEL__ */
- error = xfs_buf_cache_init(&pag->pag_bcache);
- if (error)
- goto out_remove_pag;
-
- /* Active ref owned by mount indicates AG is online. */
- atomic_set(&pag->pag_active_ref, 1);
+ error = xfs_buf_cache_init(&pag->pag_bcache);
+ if (error)
+ goto out_free_perag;
- /*
- * Pre-calculated geometry
- */
- pag->block_count = __xfs_ag_block_count(mp, index, new_agcount,
+ /*
+ * Pre-calculated geometry
+ */
+ pag_group(pag)->xg_block_count = __xfs_ag_block_count(mp, index, agcount,
dblocks);
- pag->min_block = XFS_AGFL_BLOCK(mp);
- __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
- &pag->agino_max);
- }
+ pag_group(pag)->xg_min_gbno = XFS_AGFL_BLOCK(mp) + 1;
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
- index = xfs_set_inode_alloc(mp, new_agcount);
+ error = xfs_group_insert(mp, pag_group(pag), index, XG_TYPE_AG);
+ if (error)
+ goto out_buf_cache_destroy;
- if (maxagi)
- *maxagi = index;
+ return 0;
+
+out_buf_cache_destroy:
+ xfs_buf_cache_destroy(&pag->pag_bcache);
+out_free_perag:
+ kfree(pag);
+ return error;
+}
+int
+xfs_initialize_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t orig_agcount,
+ xfs_agnumber_t new_agcount,
+ xfs_rfsblock_t dblocks,
+ xfs_agnumber_t *maxagi)
+{
+ xfs_agnumber_t index;
+ int error;
+
+ if (orig_agcount >= new_agcount)
+ return 0;
+
+ for (index = orig_agcount; index < new_agcount; index++) {
+ error = xfs_perag_alloc(mp, index, new_agcount, dblocks);
+ if (error)
+ goto out_unwind_new_pags;
+ }
+
+ *maxagi = xfs_set_inode_alloc(mp, new_agcount);
mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
return 0;
-out_remove_pag:
- xfs_defer_drain_free(&pag->pag_intents_drain);
- pag = xa_erase(&mp->m_perags, index);
-out_free_pag:
- kfree(pag);
out_unwind_new_pags:
- xfs_free_perag_range(mp, old_agcount, index);
+ xfs_free_perag_range(mp, orig_agcount, index);
return error;
}
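/*
 * Note that the rework makes *maxagi unconditional: the old "if (maxagi)"
 * guard is gone, so every caller must now pass a valid pointer.  A hedged
 * sketch of the expected mount-time call (argument values illustrative):
 *
 *	error = xfs_initialize_perag(mp, 0, sbp->sb_agcount,
 *			sbp->sb_dblocks, &mp->m_maxagi);
 *	if (error)
 *		return error;
 */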
@@ -818,7 +741,7 @@ xfs_ag_shrink_space(
struct xfs_trans **tpp,
xfs_extlen_t delta)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_alloc_arg args = {
.tp = *tpp,
.mp = mp,
@@ -835,7 +758,7 @@ xfs_ag_shrink_space(
xfs_agblock_t aglen;
int error, err2;
- ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1);
+ ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1);
error = xfs_ialloc_read_agi(pag, *tpp, 0, &agibp);
if (error)
return error;
@@ -872,7 +795,7 @@ xfs_ag_shrink_space(
/* internal log shouldn't also show up in the free space btrees */
error = xfs_alloc_vextent_exact_bno(&args,
- XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta));
+ xfs_agbno_to_fsb(pag, aglen - delta));
if (!error && args.agbno == NULLAGBLOCK)
error = -ENOSPC;
@@ -931,9 +854,9 @@ xfs_ag_shrink_space(
}
/* Update perag geometry */
- pag->block_count -= delta;
- __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
- &pag->agino_max);
+ pag_group(pag)->xg_block_count -= delta;
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
@@ -958,12 +881,13 @@ xfs_ag_extend_space(
struct xfs_trans *tp,
xfs_extlen_t len)
{
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
int error;
- ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1);
+ ASSERT(pag_agno(pag) == mp->m_sb.sb_agcount - 1);
error = xfs_ialloc_read_agi(pag, tp, 0, &bp);
if (error)
@@ -1002,9 +926,9 @@ xfs_ag_extend_space(
return error;
/* Update perag geometry */
- pag->block_count = be32_to_cpu(agf->agf_length);
- __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
- &pag->agino_max);
+ pag_group(pag)->xg_block_count = be32_to_cpu(agf->agf_length);
+ __xfs_agino_range(mp, pag_group(pag)->xg_block_count, &pag->agino_min,
+ &pag->agino_max);
return 0;
}
@@ -1031,7 +955,7 @@ xfs_ag_get_geometry(
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
- ageo->ag_number = pag->pag_agno;
+ ageo->ag_number = pag_agno(pag);
agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 9edfe0e96439..1f24cfa27321 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -7,6 +7,8 @@
#ifndef __LIBXFS_AG_H
#define __LIBXFS_AG_H 1
+#include "xfs_group.h"
+
struct xfs_mount;
struct xfs_trans;
struct xfs_perag;
@@ -30,11 +32,7 @@ struct xfs_ag_resv {
* performance of allocation group selection.
*/
struct xfs_perag {
- struct xfs_mount *pag_mount; /* owner filesystem */
- xfs_agnumber_t pag_agno; /* AG this structure belongs to */
- atomic_t pag_ref; /* passive reference count */
- atomic_t pag_active_ref; /* active reference count */
- wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
+ struct xfs_group pag_group;
unsigned long pag_opstate;
uint8_t pagf_bno_level; /* # of levels in bno btree */
uint8_t pagf_cnt_level; /* # of levels in cnt btree */
@@ -55,7 +53,6 @@ struct xfs_perag {
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
- int pagb_count; /* pagb slots in use */
 uint8_t pagf_refcount_level; /* refcount btree height */
/* Blocks reserved for all kinds of metadata. */
@@ -64,21 +61,12 @@ struct xfs_perag {
struct xfs_ag_resv pag_rmapbt_resv;
/* Precalculated geometry info */
- xfs_agblock_t block_count;
- xfs_agblock_t min_block;
xfs_agino_t agino_min;
xfs_agino_t agino_max;
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
- /*
- * Bitsets of per-ag metadata that have been checked and/or are sick.
- * Callers should hold pag_state_lock before accessing this field.
- */
- uint16_t pag_checked;
- uint16_t pag_sick;
-
#ifdef CONFIG_XFS_ONLINE_REPAIR
/*
* Alternate btree heights so that online repair won't trip the write
@@ -90,13 +78,6 @@ struct xfs_perag {
uint8_t pagf_repair_rmap_level;
#endif
- spinlock_t pag_state_lock;
-
- spinlock_t pagb_lock; /* lock for pagb_tree */
- struct rb_root pagb_tree; /* ordered tree of busy extents */
- unsigned int pagb_gen; /* generation count for pagb_tree */
- wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */
-
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
spinlock_t pag_ici_lock; /* incore inode cache lock */
@@ -108,21 +89,29 @@ struct xfs_perag {
/* background prealloc block trimming */
struct delayed_work pag_blockgc_work;
-
- /*
- * We use xfs_drain to track the number of deferred log intent items
- * that have been queued (but not yet processed) so that waiters (e.g.
- * scrub) will not lock resources when other threads are in the middle
- * of processing a chain of intent items only to find momentary
- * inconsistencies.
- */
- struct xfs_defer_drain pag_intents_drain;
-
- /* Hook to feed rmapbt updates to an active online repair. */
- struct xfs_hooks pag_rmap_update_hooks;
#endif /* __KERNEL__ */
};
+static inline struct xfs_perag *to_perag(struct xfs_group *xg)
+{
+ return container_of(xg, struct xfs_perag, pag_group);
+}
+
+static inline struct xfs_group *pag_group(struct xfs_perag *pag)
+{
+ return &pag->pag_group;
+}
+
+static inline struct xfs_mount *pag_mount(const struct xfs_perag *pag)
+{
+ return pag->pag_group.xg_mount;
+}
+
+static inline xfs_agnumber_t pag_agno(const struct xfs_perag *pag)
+{
+ return pag->pag_group.xg_gno;
+}
+
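/*
 * Since the group is embedded in struct xfs_perag, to_perag() and
 * pag_group() are pure pointer arithmetic via container_of() (and because
 * pag_group happens to be the first member, to_perag() degenerates to a
 * cast).  A minimal sanity sketch of the round trip (hypothetical, not
 * part of the patch):
 *
 *	struct xfs_perag *pag = xfs_perag_get(mp, agno);
 *
 *	ASSERT(to_perag(pag_group(pag)) == pag);
 *	xfs_perag_put(pag);
 *
 * Holding or releasing a perag is therefore exactly holding or releasing
 * its embedded group, as the wrappers further down make explicit.
 */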
/*
* Per-AG operational state. These are atomic flag bits.
*/
@@ -144,8 +133,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
-int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount,
- xfs_agnumber_t agcount, xfs_rfsblock_t dcount,
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t orig_agcount,
+ xfs_agnumber_t new_agcount, xfs_rfsblock_t dcount,
xfs_agnumber_t *maxagi);
void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno,
xfs_agnumber_t end_agno);
@@ -153,13 +142,71 @@ int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount);
/* Passive AG references */
-struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
-struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag);
-void xfs_perag_put(struct xfs_perag *pag);
+static inline struct xfs_perag *
+xfs_perag_get(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return to_perag(xfs_group_get(mp, agno, XG_TYPE_AG));
+}
+
+static inline struct xfs_perag *
+xfs_perag_hold(
+ struct xfs_perag *pag)
+{
+ return to_perag(xfs_group_hold(pag_group(pag)));
+}
+
+static inline void
+xfs_perag_put(
+ struct xfs_perag *pag)
+{
+ xfs_group_put(pag_group(pag));
+}
/* Active AG references */
-struct xfs_perag *xfs_perag_grab(struct xfs_mount *, xfs_agnumber_t);
-void xfs_perag_rele(struct xfs_perag *pag);
+static inline struct xfs_perag *
+xfs_perag_grab(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return to_perag(xfs_group_grab(mp, agno, XG_TYPE_AG));
+}
+
+static inline void
+xfs_perag_rele(
+ struct xfs_perag *pag)
+{
+ xfs_group_rele(pag_group(pag));
+}
+
+static inline struct xfs_perag *
+xfs_perag_next_range(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agnumber_t start_agno,
+ xfs_agnumber_t end_agno)
+{
+ return to_perag(xfs_group_next_range(mp, pag ? pag_group(pag) : NULL,
+ start_agno, end_agno, XG_TYPE_AG));
+}
+
+static inline struct xfs_perag *
+xfs_perag_next_from(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_agnumber_t start_agno)
+{
+ return xfs_perag_next_range(mp, pag, start_agno, mp->m_sb.sb_agcount - 1);
+}
+
+static inline struct xfs_perag *
+xfs_perag_next(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ return xfs_perag_next_from(mp, pag, 0);
+}
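/*
 * A plausible caller-side loop using the new iterators, assuming
 * xfs_group_next_range() grabs an active reference on the group it
 * returns, releases the one passed in, and returns NULL past end_agno
 * (mirroring the for_each_perag machinery this replaces):
 *
 *	struct xfs_perag *pag = NULL;
 *
 *	while ((pag = xfs_perag_next(mp, pag)) != NULL) {
 *		...	operate on pag; the next call drops this ref
 *	}
 *
 * Breaking out of the loop early requires an explicit xfs_perag_rele().
 */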
/*
 * Per-ag geometry information and validation
@@ -171,11 +218,7 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
static inline bool
xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
{
- if (agbno >= pag->block_count)
- return false;
- if (agbno <= pag->min_block)
- return false;
- return true;
+ return xfs_verify_gbno(pag_group(pag), agbno);
}
static inline bool
@@ -184,13 +227,7 @@ xfs_verify_agbext(
xfs_agblock_t agbno,
xfs_agblock_t len)
{
- if (agbno + len <= agbno)
- return false;
-
- if (!xfs_verify_agbno(pag, agbno))
- return false;
-
- return xfs_verify_agbno(pag, agbno + len - 1);
+ return xfs_verify_gbext(pag_group(pag), agbno, len);
}
/*
@@ -226,40 +263,6 @@ xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno)
agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
}
-/*
- * Perag iteration APIs
- */
-static inline struct xfs_perag *
-xfs_perag_next(
- struct xfs_perag *pag,
- xfs_agnumber_t *agno,
- xfs_agnumber_t end_agno)
-{
- struct xfs_mount *mp = pag->pag_mount;
-
- *agno = pag->pag_agno + 1;
- xfs_perag_rele(pag);
- while (*agno <= end_agno) {
- pag = xfs_perag_grab(mp, *agno);
- if (pag)
- return pag;
- (*agno)++;
- }
- return NULL;
-}
-
-#define for_each_perag_range(mp, agno, end_agno, pag) \
- for ((pag) = xfs_perag_grab((mp), (agno)); \
- (pag) != NULL; \
- (pag) = xfs_perag_next((pag), &(agno), (end_agno)))
-
-#define for_each_perag_from(mp, agno, pag) \
- for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag))
-
-#define for_each_perag(mp, agno, pag) \
- (agno) = 0; \
- for_each_perag_from((mp), (agno), (pag))
-
static inline struct xfs_perag *
xfs_perag_next_wrap(
struct xfs_perag *pag,
@@ -268,9 +271,9 @@ xfs_perag_next_wrap(
xfs_agnumber_t restart_agno,
xfs_agnumber_t wrap_agno)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
- *agno = pag->pag_agno + 1;
+ *agno = pag_agno(pag) + 1;
xfs_perag_rele(pag);
while (*agno != stop_agno) {
if (*agno >= wrap_agno) {
@@ -332,4 +335,28 @@ int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp,
xfs_extlen_t len);
int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+static inline xfs_fsblock_t
+xfs_agbno_to_fsb(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno)
+{
+ return XFS_AGB_TO_FSB(pag_mount(pag), pag_agno(pag), agbno);
+}
+
+static inline xfs_daddr_t
+xfs_agbno_to_daddr(
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno)
+{
+ return XFS_AGB_TO_DADDR(pag_mount(pag), pag_agno(pag), agbno);
+}
+
+static inline xfs_ino_t
+xfs_agino_to_ino(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ return XFS_AGINO_TO_INO(pag_mount(pag), pag_agno(pag), agino);
+}
+
#endif /* __LIBXFS_AG_H */
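/*
 * A worked example of the encoding behind xfs_agbno_to_fsb(), assuming
 * the classic XFS_AGB_TO_FSB() definition
 * (fsb = ((xfs_fsblock_t)agno << sb_agblklog) | agbno; treat the macro
 * itself as authoritative).  With sb_agblklog = 16:
 *
 *	agno = 3, agbno = 0x1234
 *	fsb  = (3ULL << 16) | 0x1234 = 0x31234
 *
 * XFS_FSB_TO_AGNO()/XFS_FSB_TO_AGBNO() invert the shift and mask, which
 * is why the ASSERTs in the xfs_alloc_vextent_*() hunks below can compare
 * pag_agno() against XFS_FSB_TO_AGNO() of a target fsbno.
 */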
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 216423df939e..f5d853089019 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -70,6 +70,7 @@ xfs_ag_resv_critical(
struct xfs_perag *pag,
enum xfs_ag_resv_type type)
{
+ struct xfs_mount *mp = pag_mount(pag);
xfs_extlen_t avail;
xfs_extlen_t orig;
@@ -92,8 +93,8 @@ xfs_ag_resv_critical(
/* Critically low if less than 10% or max btree height remains. */
return XFS_TEST_ERROR(avail < orig / 10 ||
- avail < pag->pag_mount->m_agbtree_maxlevels,
- pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
+ avail < mp->m_agbtree_maxlevels,
+ mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -137,8 +138,8 @@ __xfs_ag_resv_free(
trace_xfs_ag_resv_free(pag, type, 0);
resv = xfs_perag_resv(pag, type);
- if (pag->pag_agno == 0)
- pag->pag_mount->m_ag_max_usable += resv->ar_asked;
+ if (pag_agno(pag) == 0)
+ pag_mount(pag)->m_ag_max_usable += resv->ar_asked;
/*
* RMAPBT blocks come from the AGFL and AGFL blocks are always
* considered "free", so whatever was reserved at mount time must be
@@ -148,7 +149,7 @@ __xfs_ag_resv_free(
oldresv = resv->ar_orig_reserved;
else
oldresv = resv->ar_reserved;
- xfs_add_fdblocks(pag->pag_mount, oldresv);
+ xfs_add_fdblocks(pag_mount(pag), oldresv);
resv->ar_reserved = 0;
resv->ar_asked = 0;
resv->ar_orig_reserved = 0;
@@ -170,7 +171,7 @@ __xfs_ag_resv_init(
xfs_extlen_t ask,
xfs_extlen_t used)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_ag_resv *resv;
int error;
xfs_extlen_t hidden_space;
@@ -206,11 +207,10 @@ __xfs_ag_resv_init(
else
error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
- trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
- error, _RET_IP_);
+ trace_xfs_ag_resv_init_error(pag, error, _RET_IP_);
xfs_warn(mp,
"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
- pag->pag_agno);
+ pag_agno(pag));
return error;
}
@@ -220,7 +220,7 @@ __xfs_ag_resv_init(
* counter, we only make the adjustment for AG 0. This assumes that
* there aren't any AGs hungrier for per-AG reservation than AG 0.
*/
- if (pag->pag_agno == 0)
+ if (pag_agno(pag) == 0)
mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
@@ -238,7 +238,7 @@ xfs_ag_resv_init(
struct xfs_perag *pag,
struct xfs_trans *tp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
xfs_extlen_t ask;
xfs_extlen_t used;
int error = 0, error2;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 22bdbb3e9980..3d33e17f2e5c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -275,7 +275,7 @@ xfs_alloc_complain_bad_rec(
xfs_warn(mp,
"%sbt record corruption in AG %d detected at %pS!",
- cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_ops->name, cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"start block 0x%x block count 0x%x", irec->ar_startblock,
irec->ar_blockcount);
@@ -303,7 +303,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -331,7 +331,8 @@ xfs_alloc_compute_aligned(
bool busy;
/* Trim busy sections out of found extent */
- busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
+ busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen,
+ args->maxlen, &bno, &len, busy_gen);
/*
* If we have a largish extent that happens to start before min_agbno,
@@ -539,7 +540,7 @@ static int
xfs_alloc_fixup_longest(
struct xfs_btree_cur *cnt_cur)
{
- struct xfs_perag *pag = cnt_cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cnt_cur->bc_group);
struct xfs_buf *bp = cnt_cur->bc_ag.agbp;
struct xfs_agf *agf = bp->b_addr;
xfs_extlen_t longest = 0;
@@ -799,7 +800,7 @@ xfs_agfl_verify(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno(bp->b_pag))
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
@@ -879,13 +880,12 @@ xfs_alloc_read_agfl(
struct xfs_trans *tp,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *bp;
int error;
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
@@ -1252,14 +1252,14 @@ xfs_alloc_ag_vextent_small(
if (fbno == NULLAGBLOCK)
goto out;
- xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
+ xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1,
(args->datatype & XFS_ALLOC_NOBUSY));
if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ xfs_agbno_to_daddr(args->pag, fbno),
args->mp->m_bsize, 0, &bp);
if (error)
goto error;
@@ -1365,7 +1365,8 @@ xfs_alloc_ag_vextent_exact(
*/
tbno = fbno;
tlen = flen;
- xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
+ xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen,
+ &tbno, &tlen, &busy_gen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -1758,8 +1759,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_near_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- acur.busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), acur.busy_gen,
+ alloc_flags);
if (error)
goto out;
@@ -1874,8 +1876,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_size_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
if (error)
goto error0;
@@ -1973,8 +1976,9 @@ restart:
* the allocation can be retried.
*/
trace_xfs_alloc_size_busy(args);
- error = xfs_extent_busy_flush(args->tp, args->pag,
- busy_gen, alloc_flags);
+ error = xfs_extent_busy_flush(args->tp,
+ pag_group(args->pag), busy_gen,
+ alloc_flags);
if (error)
goto error0;
@@ -2037,7 +2041,6 @@ int
xfs_free_ag_extent(
struct xfs_trans *tp,
struct xfs_buf *agbp,
- xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len,
const struct xfs_owner_info *oinfo,
@@ -2358,19 +2361,19 @@ xfs_free_ag_extent(
* Update the freespace totals in the ag and superblock.
*/
error = xfs_alloc_update_counters(tp, agbp, len);
- xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
+ xfs_ag_resv_free_extent(pag, type, tp, len);
if (error)
goto error0;
XFS_STATS_INC(mp, xs_freex);
XFS_STATS_ADD(mp, xs_freeb, len);
- trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
+ trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright);
return 0;
error0:
- trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
+ trace_xfs_free_extent(pag, bno, len, type, -1, -1);
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
if (cnt_cur)
@@ -2429,7 +2432,7 @@ xfs_alloc_longest_free_extent(
* reservations and AGFL rules in place, we can return this extent.
*/
if (pag->pagf_longest > delta)
- return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable,
pag->pagf_longest - delta);
/* Otherwise, let the caller try for 1 block if there's space. */
@@ -2612,7 +2615,7 @@ xfs_agfl_reset(
xfs_warn(mp,
"WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
"Please unmount and run xfs_repair.",
- pag->pag_agno, pag->pagf_flcount);
+ pag_agno(pag), pag->pagf_flcount);
agf->agf_flfirst = 0;
agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
@@ -2645,8 +2648,17 @@ xfs_defer_extent_free(
ASSERT(!isnullstartblock(bno));
ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS));
- if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
- return -EFSCORRUPTED;
+ if (free_flags & XFS_FREE_EXTENT_REALTIME) {
+ if (type != XFS_AG_RESV_NONE) {
+ ASSERT(type == XFS_AG_RESV_NONE);
+ return -EFSCORRUPTED;
+ }
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ } else {
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+ return -EFSCORRUPTED;
+ }
xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
@@ -2655,6 +2667,8 @@ xfs_defer_extent_free(
xefi->xefi_agresv = type;
if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD)
xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ if (free_flags & XFS_FREE_EXTENT_REALTIME)
+ xefi->xefi_flags |= XFS_EFI_REALTIME;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
@@ -2934,9 +2948,8 @@ xfs_alloc_fix_freelist(
* Deferring the free disconnects freeing up the AGFL slot from
* freeing the block.
*/
- error = xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, args->agno, bno), 1,
- &targs.oinfo, XFS_AG_RESV_AGFL, 0);
+ error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno),
+ 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0);
if (error)
goto out_agbp_relse;
}
@@ -3156,8 +3169,6 @@ xfs_alloc_put_freelist(
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_alloc_log_agf(tp, agbp, logflags);
-
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
agfl_bno = xfs_buf_to_agfl_bno(agflbp);
@@ -3190,7 +3201,7 @@ xfs_validate_ag_length(
* use it by using uncached buffers that don't have the perag attached
* so we can detect and avoid this problem.
*/
- if (bp->b_pag && seqno != bp->b_pag->pag_agno)
+ if (bp->b_pag && seqno != pag_agno(bp->b_pag))
return __this_address;
/*
@@ -3359,13 +3370,13 @@ xfs_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
- trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_read_agf(pag);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
@@ -3388,12 +3399,13 @@ xfs_alloc_read_agf(
int flags,
struct xfs_buf **agfbpp)
{
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_buf *agfbp;
struct xfs_agf *agf;
int error;
int allocbt_blks;
- trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
+ trace_xfs_alloc_read_agf(pag);
/* We don't support trylock when freeing. */
ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
@@ -3414,7 +3426,7 @@ xfs_alloc_read_agf(
pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
- if (xfs_agfl_needs_reset(pag->pag_mount, agf))
+ if (xfs_agfl_needs_reset(mp, agf))
set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
else
clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3427,16 +3439,15 @@ xfs_alloc_read_agf(
* counter only tracks non-root blocks.
*/
allocbt_blks = pag->pagf_btreeblks;
- if (xfs_has_rmapbt(pag->pag_mount))
+ if (xfs_has_rmapbt(mp))
allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
if (allocbt_blks > 0)
- atomic64_add(allocbt_blks,
- &pag->pag_mount->m_allocbt_blks);
+ atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
#ifdef DEBUG
- else if (!xfs_is_shutdown(pag->pag_mount)) {
+ else if (!xfs_is_shutdown(mp)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
@@ -3597,7 +3608,7 @@ xfs_alloc_vextent_finish(
goto out_drop_perag;
}
- args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+ args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno);
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
@@ -3618,8 +3629,8 @@ xfs_alloc_vextent_finish(
if (error)
goto out_drop_perag;
- ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
- args->len));
+ ASSERT(!xfs_extent_busy_search(pag_group(args->pag),
+ args->agbno, args->len));
}
xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
@@ -3649,21 +3660,20 @@ xfs_alloc_vextent_this_ag(
struct xfs_alloc_arg *args,
xfs_agnumber_t agno)
{
- struct xfs_mount *mp = args->mp;
xfs_agnumber_t minimum_agno;
uint32_t alloc_flags = 0;
int error;
ASSERT(args->pag != NULL);
- ASSERT(args->pag->pag_agno == agno);
+ ASSERT(pag_agno(args->pag) == agno);
args->agno = agno;
args->agbno = 0;
trace_xfs_alloc_vextent_this_ag(args);
- error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
- &minimum_agno);
+ error = xfs_alloc_vextent_check_args(args,
+ xfs_agbno_to_fsb(args->pag, 0), &minimum_agno);
if (error) {
if (error == -ENOSPC)
return 0;
@@ -3868,7 +3878,7 @@ xfs_alloc_vextent_exact_bno(
int error;
ASSERT(args->pag != NULL);
- ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
args->agno = XFS_FSB_TO_AGNO(mp, target);
args->agbno = XFS_FSB_TO_AGBNO(mp, target);
@@ -3907,7 +3917,7 @@ xfs_alloc_vextent_near_bno(
int error;
if (!needs_perag)
- ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
+ ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));
args->agno = XFS_FSB_TO_AGNO(mp, target);
args->agbno = XFS_FSB_TO_AGBNO(mp, target);
@@ -3944,7 +3954,7 @@ xfs_free_extent_fix_freelist(
memset(&args, 0, sizeof(struct xfs_alloc_arg));
args.tp = tp;
args.mp = tp->t_mountp;
- args.agno = pag->pag_agno;
+ args.agno = pag_agno(pag);
args.pag = pag;
/*
@@ -4012,14 +4022,13 @@ __xfs_free_extent(
goto err_release;
}
- error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
- type);
+ error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
if (error)
goto err_release;
if (skip_discard)
busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
- xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
+ xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
return 0;
err_release:
@@ -4044,7 +4053,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0165452e7cd0..50ef79a1ed41 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -79,9 +79,8 @@ int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp,
struct xfs_buf *agfbp, struct xfs_buf *agflbp,
xfs_agblock_t bno, int btreeblk);
int xfs_free_ag_extent(struct xfs_trans *tp, struct xfs_buf *agbp,
- xfs_agnumber_t agno, xfs_agblock_t bno,
- xfs_extlen_t len, const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type);
+ xfs_agblock_t bno, xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
/*
* Compute and fill in value of m_alloc_maxlevels.
@@ -238,7 +237,11 @@ int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
/* Don't issue a discard for the blocks freed. */
#define XFS_FREE_EXTENT_SKIP_DISCARD (1U << 0)
-#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD)
+/* Free blocks on the realtime device. */
+#define XFS_FREE_EXTENT_REALTIME (1U << 1)
+
+#define XFS_FREE_EXTENT_ALL_FLAGS (XFS_FREE_EXTENT_SKIP_DISCARD | \
+ XFS_FREE_EXTENT_REALTIME)
/*
* List of extents to be free "later".
@@ -249,7 +252,7 @@ struct xfs_extent_free_item {
uint64_t xefi_owner;
xfs_fsblock_t xefi_startblock;/* starting fs block number */
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
- struct xfs_perag *xefi_pag;
+ struct xfs_group *xefi_group;
unsigned int xefi_flags;
enum xfs_ag_resv_type xefi_agresv;
};
@@ -258,6 +261,12 @@ struct xfs_extent_free_item {
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
 #define XFS_EFI_CANCELLED (1U << 3) /* don't actually free the space */
+#define XFS_EFI_REALTIME (1U << 4) /* freeing realtime extent */
+
+static inline bool xfs_efi_is_realtime(const struct xfs_extent_free_item *xefi)
+{
+ return xefi->xefi_flags & XFS_EFI_REALTIME;
+}
struct xfs_alloc_autoreap {
struct xfs_defer_pending *dfp;
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index aada676eee51..a4ac37ba5d51 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -28,7 +28,7 @@ xfs_bnobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
- cur->bc_ag.pag);
+ to_perag(cur->bc_group));
}
STATIC struct xfs_btree_cur *
@@ -36,29 +36,29 @@ xfs_cntbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
- cur->bc_ag.pag);
+ to_perag(cur->bc_group));
}
-
STATIC void
xfs_allocbt_set_root(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_ag.agbp;
- struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
ASSERT(ptr->s != 0);
if (xfs_btree_is_bno(cur->bc_ops)) {
agf->agf_bno_root = ptr->s;
be32_add_cpu(&agf->agf_bno_level, inc);
- cur->bc_ag.pag->pagf_bno_level += inc;
+ pag->pagf_bno_level += inc;
} else {
agf->agf_cnt_root = ptr->s;
be32_add_cpu(&agf->agf_cnt_level, inc);
- cur->bc_ag.pag->pagf_cnt_level += inc;
+ pag->pagf_cnt_level += inc;
}
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
@@ -75,7 +75,7 @@ xfs_allocbt_alloc_block(
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp,
+ error = xfs_alloc_get_freelist(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp, &bno, 1);
if (error)
return error;
@@ -86,7 +86,7 @@ xfs_allocbt_alloc_block(
}
atomic64_inc(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_group, bno, 1, false);
new->s = cpu_to_be32(bno);
@@ -104,13 +104,13 @@ xfs_allocbt_free_block(
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
- error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL,
- bno, 1);
+ error = xfs_alloc_put_freelist(to_perag(cur->bc_group), cur->bc_tp,
+ agbp, NULL, bno, 1);
if (error)
return error;
atomic64_dec(&cur->bc_mp->m_allocbt_blks);
- xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag_group(agbp->b_pag), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
}
@@ -178,7 +178,7 @@ xfs_allocbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
if (xfs_btree_is_bno(cur->bc_ops))
ptr->s = agf->agf_bno_root;
@@ -492,7 +492,7 @@ xfs_bnobt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
@@ -518,7 +518,7 @@ xfs_cntbt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index c63da14eee04..17875ad865f5 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1004,7 +1004,10 @@ xfs_attr_add_fork(
unsigned int blks; /* space reservation */
int error; /* error return value */
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ if (xfs_is_metadir_inode(ip))
+ ASSERT(XFS_IS_DQDETACHED(ip));
+ else
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
blks = XFS_ADDAFORK_SPACE_RES(mp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 36dd08d13293..9052839305e2 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -40,6 +40,7 @@
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
+#include "xfs_rtgroup.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -1042,7 +1043,10 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ if (xfs_is_metadir_inode(ip))
+ ASSERT(XFS_IS_DQDETACHED(ip));
+ else
+ ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1423,6 +1427,24 @@ xfs_bmap_last_offset(
* Extent tree manipulation functions used during allocation.
*/
+static inline bool
+xfs_bmap_same_rtgroup(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *left,
+ struct xfs_bmbt_irec *right)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_ifork_is_realtime(ip, whichfork) && xfs_has_rtgroups(mp)) {
+ if (xfs_rtb_to_rgno(mp, left->br_startblock) !=
+ xfs_rtb_to_rgno(mp, right->br_startblock))
+ return false;
+ }
+
+ return true;
+}
+
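/*
 * xfs_bmap_same_rtgroup() is folded into every left/right contiguity
 * check below so that extents never merge across a realtime group
 * boundary.  A hypothetical illustration: if the left extent ends on the
 * last block of rtgroup 0 and the right extent begins on the first block
 * of rtgroup 1, the two are physically adjacent, yet xfs_rtb_to_rgno()
 * returns different group numbers, the helper returns false, and neither
 * BMAP_LEFT_CONTIG nor BMAP_RIGHT_CONTIG gets set.
 */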
/*
* Convert a delayed allocation to a real allocation.
*/
@@ -1492,7 +1514,8 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(bma->ip, whichfork, &LEFT, new))
state |= BMAP_LEFT_CONTIG;
/*
@@ -1516,7 +1539,8 @@ xfs_bmap_add_extent_delay_real(
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= XFS_MAX_BMBT_EXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(bma->ip, whichfork, new, &RIGHT))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2061,7 +2085,8 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(ip, whichfork, &LEFT, new))
state |= BMAP_LEFT_CONTIG;
/*
@@ -2085,7 +2110,8 @@ xfs_bmap_add_extent_unwritten_real(
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= XFS_MAX_BMBT_EXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(ip, whichfork, new, &RIGHT))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2594,7 +2620,8 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(ip, whichfork, &left, new))
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
@@ -2602,7 +2629,8 @@ xfs_bmap_add_extent_hole_delay(
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)) &&
+ xfs_bmap_same_rtgroup(ip, whichfork, new, &right))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2745,7 +2773,8 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ xfs_bmap_same_rtgroup(ip, whichfork, &left, new))
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
@@ -2755,7 +2784,8 @@ xfs_bmap_add_extent_hole_real(
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN) &&
+ xfs_bmap_same_rtgroup(ip, whichfork, new, &right))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -3121,8 +3151,15 @@ xfs_bmap_adjacent_valid(
struct xfs_mount *mp = ap->ip->i_mount;
if (XFS_IS_REALTIME_INODE(ap->ip) &&
- (ap->datatype & XFS_ALLOC_USERDATA))
- return x < mp->m_sb.sb_rblocks;
+ (ap->datatype & XFS_ALLOC_USERDATA)) {
+ if (!xfs_has_rtgroups(mp))
+ return x < mp->m_sb.sb_rblocks;
+
+ return xfs_rtb_to_rgno(mp, x) == xfs_rtb_to_rgno(mp, y) &&
+ xfs_rtb_to_rgno(mp, x) < mp->m_sb.sb_rgcount &&
+ xfs_rtb_to_rtx(mp, x) < mp->m_sb.sb_rgextents;
+ }
return XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) &&
XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount &&
@@ -3280,7 +3317,7 @@ xfs_bmap_longest_free_extent(
}
longest = xfs_alloc_longest_free_extent(pag,
- xfs_alloc_min_freelist(pag->pag_mount, pag),
+ xfs_alloc_min_freelist(pag_mount(pag), pag),
xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE));
if (*blen < longest)
*blen = longest;
@@ -4091,7 +4128,7 @@ retry:
fdblocks = indlen;
if (XFS_IS_REALTIME_INODE(ip)) {
- error = xfs_dec_frextents(mp, xfs_rtb_to_rtx(mp, alen));
+ error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
if (error)
goto out_unreserve_quota;
} else {
@@ -4126,7 +4163,7 @@ retry:
out_unreserve_frextents:
if (XFS_IS_REALTIME_INODE(ip))
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, alen));
+ xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_quota_unreserve_blkres(ip, alen);
@@ -5034,7 +5071,7 @@ xfs_bmap_del_extent_delay(
fdblocks = da_diff;
if (isrt)
- xfs_add_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount));
+ xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
else
fdblocks += del->br_blockcount;
@@ -5113,6 +5150,34 @@ xfs_bmap_del_extent_cow(
ip->i_delayed_blks -= del->br_blockcount;
}
+static int
+xfs_bmap_free_rtblocks(
+ struct xfs_trans *tp,
+ struct xfs_bmbt_irec *del)
+{
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ rtg = xfs_rtgroup_grab(tp->t_mountp, 0);
+ if (!rtg)
+ return -EIO;
+
+ /*
+ * Ensure the bitmap and summary inodes are locked and joined to the
+ * transaction before modifying them.
+ */
+ if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
+ tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_BITMAP);
+ }
+
+ error = xfs_rtfree_blocks(tp, rtg, del->br_startblock,
+ del->br_blockcount);
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
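/*
 * Two things worth noting about the helper above.  First, it only runs
 * when !xfs_has_rtgroups() (see the caller below), in which case the
 * whole realtime volume is modelled as a single group, so grabbing
 * rtgroup 0 covers every rt block.  Second, XFS_TRANS_RTBITMAP_LOCKED
 * implements a lock-once-per-transaction idiom: the first rt free in a
 * transaction locks and joins the bitmap/summary inodes, and later frees
 * in the same transaction see the flag and skip straight to
 * xfs_rtfree_blocks().  The transaction teardown presumably drops the
 * flag and the inode locks at commit.
 */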
/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space.
@@ -5325,20 +5390,12 @@ xfs_bmap_del_extent_real(
* If we need to, add to list of extents to delete.
*/
if (!(bflags & XFS_BMAPI_REMAP)) {
+ bool isrt = xfs_ifork_is_realtime(ip, whichfork);
+
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
- } else if (xfs_ifork_is_realtime(ip, whichfork)) {
- /*
- * Ensure the bitmap and summary inodes are locked
- * and joined to the transaction before modifying them.
- */
- if (!(tp->t_flags & XFS_TRANS_RTBITMAP_LOCKED)) {
- tp->t_flags |= XFS_TRANS_RTBITMAP_LOCKED;
- xfs_rtbitmap_lock(mp);
- xfs_rtbitmap_trans_join(tp);
- }
- error = xfs_rtfree_blocks(tp, del->br_startblock,
- del->br_blockcount);
+ } else if (isrt && !xfs_has_rtgroups(mp)) {
+ error = xfs_bmap_free_rtblocks(tp, del);
} else {
unsigned int efi_flags = 0;
@@ -5346,6 +5403,19 @@ xfs_bmap_del_extent_real(
del->br_state == XFS_EXT_UNWRITTEN)
efi_flags |= XFS_FREE_EXTENT_SKIP_DISCARD;
+ /*
+ * Historically, we did not use EFIs to free realtime
+ * extents. However, when reverse mapping is enabled,
+ * we must maintain the same order of operations as the
+ * data device, which is: Remove the file mapping,
+ * remove the reverse mapping, and then free the
+ * blocks. Reflink for realtime volumes requires the
+ * same sort of ordering. Both features rely on
+ * rtgroups, so let's gate rt EFI usage on rtgroups.
+ */
+ if (isrt)
+ efi_flags |= XFS_FREE_EXTENT_REALTIME;
+
error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE, efi_flags);
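/*
 * With this change the realtime free path rides the same deferred-EFI
 * pipeline as the data device.  The flag plumbing, pieced together from
 * this patch (the rt defer-type wiring itself lives elsewhere):
 *
 *	xfs_free_extent_later(..., efi_flags | XFS_FREE_EXTENT_REALTIME)
 *	  -> xfs_defer_extent_free(): validates with xfs_verify_rtbext()
 *	     and sets XFS_EFI_REALTIME on the xfs_extent_free_item
 *	  -> xfs_efi_is_realtime() lets the defer machinery route the item
 *	     to xfs_rtextent_free_defer_type
 */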
@@ -5694,6 +5764,8 @@ xfs_bunmapi(
*/
STATIC bool
xfs_bmse_can_merge(
+ struct xfs_inode *ip,
+ int whichfork,
struct xfs_bmbt_irec *left, /* preceding extent */
struct xfs_bmbt_irec *got, /* current extent to shift */
xfs_fileoff_t shift) /* shift fsb */
@@ -5709,7 +5781,8 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN) ||
+ !xfs_bmap_same_rtgroup(ip, whichfork, left, got))
return false;
return true;
@@ -5745,7 +5818,7 @@ xfs_bmse_merge(
blockcount = left->br_blockcount + got->br_blockcount;
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
- ASSERT(xfs_bmse_can_merge(left, got, shift));
+ ASSERT(xfs_bmse_can_merge(ip, whichfork, left, got, shift));
new = *left;
new.br_blockcount = blockcount;
@@ -5907,7 +5980,8 @@ xfs_bmap_collapse_extents(
goto del_cursor;
}
- if (xfs_bmse_can_merge(&prev, &got, offset_shift_fsb)) {
+ if (xfs_bmse_can_merge(ip, whichfork, &prev, &got,
+ offset_shift_fsb)) {
error = xfs_bmse_merge(tp, ip, whichfork,
offset_shift_fsb, &icur, &got, &prev,
cur, &logflags);
@@ -6043,7 +6117,8 @@ xfs_bmap_insert_extents(
* never find mergeable extents in this scenario. Check anyways
* and warn if we encounter two extents that could be one.
*/
- if (xfs_bmse_can_merge(&got, &next, offset_shift_fsb))
+ if (xfs_bmse_can_merge(ip, whichfork, &got, &next,
+ offset_shift_fsb))
WARN_ON_ONCE(1);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 7592d46e97c6..4b721d935994 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -248,7 +248,7 @@ struct xfs_bmap_intent {
enum xfs_bmap_intent_type bi_type;
int bi_whichfork;
struct xfs_inode *bi_owner;
- struct xfs_perag *bi_pag;
+ struct xfs_group *bi_group;
struct xfs_bmbt_irec bi_bmap;
};
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index a5c4af148853..2b5fc5fd1643 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -225,7 +225,7 @@ __xfs_btree_check_agblock(
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_failaddr_t fa;
xfs_agblock_t agbno;
@@ -331,7 +331,7 @@ __xfs_btree_check_ptr(
return -EFSCORRUPTED;
break;
case XFS_BTREE_TYPE_AG:
- if (!xfs_verify_agbno(cur->bc_ag.pag,
+ if (!xfs_verify_agbno(to_perag(cur->bc_group),
be32_to_cpu((&ptr->s)[index])))
return -EFSCORRUPTED;
break;
@@ -372,7 +372,7 @@ xfs_btree_check_ptr(
case XFS_BTREE_TYPE_AG:
xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
- cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
+ cur->bc_group->xg_gno, cur->bc_ops->name,
level, index);
break;
}
@@ -523,20 +523,8 @@ xfs_btree_del_cursor(
ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
xfs_is_shutdown(cur->bc_mp) || error != 0);
- switch (cur->bc_ops->type) {
- case XFS_BTREE_TYPE_AG:
- if (cur->bc_ag.pag)
- xfs_perag_put(cur->bc_ag.pag);
- break;
- case XFS_BTREE_TYPE_INODE:
- /* nothing to do */
- break;
- case XFS_BTREE_TYPE_MEM:
- if (cur->bc_mem.pag)
- xfs_perag_put(cur->bc_mem.pag);
- break;
- }
-
+ if (cur->bc_group)
+ xfs_group_put(cur->bc_group);
kmem_cache_free(cur->bc_cache, cur);
}
@@ -1017,22 +1005,22 @@ xfs_btree_readahead_agblock(
struct xfs_btree_block *block)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
int rval = 0;
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
xfs_buf_readahead(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, left),
- mp->m_bsize, cur->bc_ops->buf_ops);
+ xfs_agbno_to_daddr(pag, left), mp->m_bsize,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
xfs_buf_readahead(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, right),
- mp->m_bsize, cur->bc_ops->buf_ops);
+ xfs_agbno_to_daddr(pag, right), mp->m_bsize,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -1091,7 +1079,7 @@ xfs_btree_ptr_to_daddr(
switch (cur->bc_ops->type) {
case XFS_BTREE_TYPE_AG:
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
be32_to_cpu(ptr->s));
break;
case XFS_BTREE_TYPE_INODE:
@@ -1313,7 +1301,7 @@ xfs_btree_owner(
case XFS_BTREE_TYPE_INODE:
return cur->bc_ino.ip->i_ino;
case XFS_BTREE_TYPE_AG:
- return cur->bc_ag.pag->pag_agno;
+ return cur->bc_group->xg_gno;
default:
ASSERT(0);
return 0;
@@ -4745,7 +4733,7 @@ xfs_btree_agblock_v5hdr_verify(
return __this_address;
if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
return __this_address;
- if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
return __this_address;
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 10b7ddc3b2b3..3b739459ebb0 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -254,6 +254,7 @@ struct xfs_btree_cur
union xfs_btree_irec bc_rec; /* current insert/search record value */
uint8_t bc_nlevels; /* number of levels in the tree */
uint8_t bc_maxlevels; /* maximum levels for this btree type */
+ struct xfs_group *bc_group;
/* per-type information */
union {
@@ -264,13 +265,11 @@ struct xfs_btree_cur
struct xbtree_ifakeroot *ifake; /* for staging cursor */
} bc_ino;
struct {
- struct xfs_perag *pag;
struct xfs_buf *agbp;
struct xbtree_afakeroot *afake; /* for staging cursor */
} bc_ag;
struct {
struct xfbtree *xfbtree;
- struct xfs_perag *pag;
} bc_mem;
};
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
index 036061fe32cc..df3d613675a1 100644
--- a/fs/xfs/libxfs/xfs_btree_mem.c
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -57,10 +57,8 @@ xfbtree_dup_cursor(
ncur->bc_flags = cur->bc_flags;
ncur->bc_nlevels = cur->bc_nlevels;
ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
-
- if (cur->bc_mem.pag)
- ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
-
+ if (cur->bc_group)
+ ncur->bc_group = xfs_group_hold(cur->bc_group);
return ncur;
}
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 2cd212ad2c1d..5b377cbbb1f7 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -846,6 +846,12 @@ xfs_defer_add(
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ if (!ops->finish_item) {
+ ASSERT(ops->finish_item != NULL);
+ xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ return NULL;
+ }
+
dfp = xfs_defer_find_last(tp, ops);
if (!dfp || !xfs_defer_can_append(dfp, ops))
dfp = xfs_defer_alloc(&tp->t_dfops, ops);
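/*
 * The new guard above turns a NULL ->finish_item into a controlled
 * shutdown instead of a later NULL-pointer dereference while finishing
 * deferred work: ASSERT() catches it loudly on debug builds, while
 * production builds shut the filesystem down and return NULL, which
 * callers of xfs_defer_add() presumably treat as a fatal error.
 */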
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 8b338031e487..ec51b8465e61 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -71,6 +71,7 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_rtextent_free_defer_type;
extern const struct xfs_defer_op_type xfs_attr_defer_type;
extern const struct xfs_defer_op_type xfs_exchmaps_defer_type;
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 15a362e2f5ea..dceef2abd4e2 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -16,6 +16,9 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_metadir.h"
+#include "xfs_metafile.h"
int
xfs_calc_dquots_per_chunk(
@@ -323,3 +326,190 @@ xfs_dquot_to_disk_ts(
return cpu_to_be32(t);
}
+
+inline unsigned int
+xfs_dqinode_sick_mask(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_SICK_FS_UQUOTA;
+ case XFS_DQTYPE_GROUP:
+ return XFS_SICK_FS_GQUOTA;
+ case XFS_DQTYPE_PROJ:
+ return XFS_SICK_FS_PQUOTA;
+ }
+
+ ASSERT(0);
+ return 0;
+}
+
+/*
+ * Load the inode for a given type of quota, assuming that the sb fields have
+ * been sorted out. This is not true when switching quota types on a V4
+ * filesystem, so do not use this function for that. If metadir is enabled,
+ * @dp must be the /quota metadir.
+ *
+ * Returns -ENOENT if the quota inode field is NULLFSINO; 0 and an inode on
+ * success; or a negative errno.
+ */
+int
+xfs_dqinode_load(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ enum xfs_metafile_type metafile_type = xfs_dqinode_metafile_type(type);
+ int error;
+
+ if (!xfs_has_metadir(mp)) {
+ xfs_ino_t ino;
+
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ ino = mp->m_sb.sb_uquotino;
+ break;
+ case XFS_DQTYPE_GROUP:
+ ino = mp->m_sb.sb_gquotino;
+ break;
+ case XFS_DQTYPE_PROJ:
+ ino = mp->m_sb.sb_pquotino;
+ break;
+ default:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Zero should have been remapped to NULLFSINO when the sb was loaded */
+ if (ino == NULLFSINO)
+ return -ENOENT;
+
+ error = xfs_trans_metafile_iget(tp, ino, metafile_type, &ip);
+ } else {
+ error = xfs_metadir_load(tp, dp, xfs_dqinode_path(type),
+ metafile_type, &ip);
+ if (error == -ENOENT)
+ return error;
+ }
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return error;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) {
+ xfs_irele(ip);
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_projid != 0)) {
+ xfs_irele(ip);
+ xfs_fs_mark_sick(mp, xfs_dqinode_sick_mask(type));
+ return -EFSCORRUPTED;
+ }
+
+ *ipp = ip;
+ return 0;
+}
+
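/*
 * A sketch of the expected calling convention, following the return
 * contract documented above (-ENOENT means "no quota inode of this
 * type", which is not an error in itself):
 *
 *	struct xfs_inode *ip = NULL;
 *	int error;
 *
 *	error = xfs_dqinode_load(tp, dp, XFS_DQTYPE_USER, &ip);
 *	if (error == -ENOENT)
 *		...	quota inode never created; carry on
 *	else if (error)
 *		return error;	-- corruption was already marked sick
 *	else
 *		...	use ip, then xfs_irele(ip) when done
 */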
+/* Create a metadata directory quota inode. */
+int
+xfs_dqinode_metadir_create(
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .metafile_type = xfs_dqinode_metafile_type(type),
+ .path = xfs_dqinode_path(type),
+ };
+ int error;
+
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ return error;
+
+ error = xfs_metadir_create(&upd, S_IFREG);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE);
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ return error;
+
+ xfs_finish_inode_setup(upd.ip);
+ *ipp = upd.ip;
+ return 0;
+}
+
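/*
 * The create/link helpers here follow what appears to be a three-phase
 * metadir update protocol: xfs_metadir_start_*() sets up the transaction
 * and locks the directory, xfs_metadir_create()/_link() performs the
 * namespace operation, and xfs_metadir_commit() commits and unlocks.
 * Error paths simply return after a failed middle step, which suggests
 * the cancel/cleanup bookkeeping is carried inside the xfs_metadir_update
 * structure rather than by the caller.
 */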
+#ifndef __KERNEL__
+/* Link a metadata directory quota inode. */
+int
+xfs_dqinode_metadir_link(
+ struct xfs_inode *dp,
+ xfs_dqtype_t type,
+ struct xfs_inode *ip)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .metafile_type = xfs_dqinode_metafile_type(type),
+ .path = xfs_dqinode_path(type),
+ .ip = ip,
+ };
+ int error;
+
+ error = xfs_metadir_start_link(&upd);
+ if (error)
+ return error;
+
+ error = xfs_metadir_link(&upd);
+ if (error)
+ return error;
+
+ xfs_trans_log_inode(upd.tp, upd.ip, XFS_ILOG_CORE);
+
+ return xfs_metadir_commit(&upd);
+}
+#endif /* __KERNEL__ */
+
+/* Create the parent directory for all quota inodes and load it. */
+int
+xfs_dqinode_mkdir_parent(
+ struct xfs_mount *mp,
+ struct xfs_inode **dpp)
+{
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_mkdir(mp->m_metadirip, "quota", dpp);
+}
+
+/*
+ * Load the parent directory of all quota inodes. Pass the inode to the caller
+ * because quota functions (e.g. QUOTARM) can be called on the quota files even
+ * if quotas are not enabled.
+ */
+int
+xfs_dqinode_load_parent(
+ struct xfs_trans *tp,
+ struct xfs_inode **dpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_load(tp, mp->m_metadirip, "quota", XFS_METAFILE_DIR,
+ dpp);
+}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index e1bfee0c3b1a..4d47a3e723aa 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -174,6 +174,14 @@ typedef struct xfs_sb {
xfs_lsn_t sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
+ xfs_ino_t sb_metadirino; /* metadata directory tree root */
+
+ xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
+ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
+
+ uint8_t sb_rgblklog; /* rt group number shift */
+ uint8_t sb_pad[7]; /* zeroes */
+
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -259,7 +267,19 @@ struct xfs_dsb {
__be64 sb_lsn; /* last write sequence */
uuid_t sb_meta_uuid; /* metadata file system unique id */
- /* must be padded to 64 bit alignment */
+ __be64 sb_metadirino; /* metadata directory tree root */
+ __be32 sb_rgcount; /* # of realtime groups */
+ __be32 sb_rgextents; /* size of rtgroup in rtx */
+
+ __u8 sb_rgblklog; /* rt group number shift */
+ __u8 sb_pad[7]; /* zeroes */
+
+ /*
+ * The size of this structure must be padded to 64 bit alignment.
+ *
+ * NOTE: Don't forget to update secondary_sb_whack in xfs_repair when
+ * adding new fields here.
+ */
};
#define XFS_SB_CRC_OFF offsetof(struct xfs_dsb, sb_crc)
@@ -278,7 +298,7 @@ struct xfs_dsb {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
+static inline bool xfs_sb_is_v5(const struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -287,12 +307,12 @@ static inline bool xfs_sb_is_v5(struct xfs_sb *sbp)
* Detect a mismatched features2 field. Older kernels read/wrote
* this into the wrong slot, so to be safe we keep them in sync.
*/
-static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
+static inline bool xfs_sb_has_mismatched_features2(const struct xfs_sb *sbp)
{
return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_hasmorebits(const struct xfs_sb *sbp)
{
return xfs_sb_is_v5(sbp) ||
(sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
@@ -342,8 +362,8 @@ static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp)
#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL
static inline bool
xfs_sb_has_compat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_compat & feature) != 0;
}
@@ -360,8 +380,8 @@ xfs_sb_has_compat_feature(
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_ro_compat & feature) != 0;
}
@@ -374,6 +394,7 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
+#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -382,13 +403,14 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR | \
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
- XFS_SB_FEAT_INCOMPAT_PARENT)
+ XFS_SB_FEAT_INCOMPAT_PARENT | \
+ XFS_SB_FEAT_INCOMPAT_METADIR)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
xfs_sb_has_incompat_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_incompat & feature) != 0;
}
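
Illustrative sketch, not part of the patch: a predicate for the new METADIR
bit would mirror the existing xfs_sb_version_has* helpers, built on the
feature-test function above.

static inline bool xfs_sb_version_hasmetadir(const struct xfs_sb *sbp)
{
	return xfs_sb_is_v5(sbp) &&
	       xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_METADIR);
}
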
@@ -399,8 +421,8 @@ xfs_sb_has_incompat_feature(
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
- struct xfs_sb *sbp,
- uint32_t feature)
+ const struct xfs_sb *sbp,
+ uint32_t feature)
{
return (sbp->sb_features_log_incompat & feature) != 0;
}
@@ -420,7 +442,7 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
-static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+static inline bool xfs_sb_version_haslogxattrs(const struct xfs_sb *sbp)
{
return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
@@ -694,21 +716,58 @@ struct xfs_agfl {
/*
* Realtime bitmap information is accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format. Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
*/
union xfs_rtword_raw {
__u32 old;
+ __be32 rtg;
};
/*
* Realtime summary counts are accessed by the word, which is currently
- * stored in host-endian format.
+ * stored in host-endian format. Starting with the realtime groups feature,
+ * the words are stored in be32 ondisk.
*/
union xfs_suminfo_raw {
__u32 old;
+ __be32 rtg;
};
/*
+ * Realtime allocation groups break the rt section into multiple pieces that
+ * could be locked independently. Realtime block group numbers are 32-bit
+ * quantities. Block numbers within a group are also 32-bit quantities, but
+ * the upper bit must never be set. rtgroup 0 might have a superblock in it,
+ * so the minimum size of an rtgroup is 2 rtx.
+ */
+#define XFS_MAX_RGBLOCKS ((xfs_rgblock_t)(1U << 31) - 1)
+#define XFS_MIN_RGEXTENTS ((xfs_rtxlen_t)2)
+#define XFS_MAX_RGNUMBER ((xfs_rgnumber_t)(-1U))
+
+#define XFS_RTSB_MAGIC 0x46726F67 /* 'Frog' */
+
+/*
+ * Realtime superblock - on disk version. Must be padded to 64 bit alignment.
+ * The first block of the realtime volume contains this superblock.
+ */
+struct xfs_rtsb {
+ __be32 rsb_magicnum; /* magic number == XFS_RTSB_MAGIC */
+ __le32 rsb_crc; /* superblock crc */
+
+ __be32 rsb_pad; /* zero */
+ unsigned char rsb_fname[XFSLABEL_MAX]; /* file system name */
+
+ uuid_t rsb_uuid; /* user-visible file system unique id */
+ uuid_t rsb_meta_uuid; /* metadata file system unique id */
+
+ /* must be padded to 64 bit alignment */
+};
+
+#define XFS_RTSB_CRC_OFF offsetof(struct xfs_rtsb, rsb_crc)
+#define XFS_RTSB_DADDR ((xfs_daddr_t)0) /* daddr in rt section */
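
Sketch only (the real buffer verifier is added elsewhere in this series): an
rtsb read verifier would check the magic and CRC roughly along these lines,
assuming the existing xfs_verify_cksum() helper.

static bool xfs_rtsb_looks_ok(struct xfs_buf *bp)
{
	struct xfs_rtsb *rsb = bp->b_addr;

	/* the magic is stored big-endian on disk */
	if (rsb->rsb_magicnum != cpu_to_be32(XFS_RTSB_MAGIC))
		return false;
	/* the CRC covers the whole block, with rsb_crc itself zeroed */
	return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
			XFS_RTSB_CRC_OFF);
}
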
+
+/*
* XFS Timestamps
* ==============
*
@@ -790,6 +849,27 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
}
+enum xfs_metafile_type {
+ XFS_METAFILE_UNKNOWN, /* unknown */
+ XFS_METAFILE_DIR, /* metadir directory */
+ XFS_METAFILE_USRQUOTA, /* user quota */
+ XFS_METAFILE_GRPQUOTA, /* group quota */
+ XFS_METAFILE_PRJQUOTA, /* project quota */
+ XFS_METAFILE_RTBITMAP, /* rt bitmap */
+ XFS_METAFILE_RTSUMMARY, /* rt summary */
+
+ XFS_METAFILE_MAX
+} __packed;
+
+#define XFS_METAFILE_TYPE_STR \
+ { XFS_METAFILE_UNKNOWN, "unknown" }, \
+ { XFS_METAFILE_DIR, "dir" }, \
+ { XFS_METAFILE_USRQUOTA, "usrquota" }, \
+ { XFS_METAFILE_GRPQUOTA, "grpquota" }, \
+ { XFS_METAFILE_PRJQUOTA, "prjquota" }, \
+ { XFS_METAFILE_RTBITMAP, "rtbitmap" }, \
+ { XFS_METAFILE_RTSUMMARY, "rtsummary" }
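
The enum is __packed so the in-core value stays small, and the string table
follows the { value, "name" } convention consumed by __print_symbolic() in
tracepoints. A bounds check, sketched here for illustration, matches the
range the dinode verifier enforces later in this patch (anything at or above
XFS_METAFILE_MAX is rejected):

static inline bool xfs_metafile_type_valid(enum xfs_metafile_type type)
{
	/* the dinode verifier rejects di_metatype >= XFS_METAFILE_MAX */
	return type < XFS_METAFILE_MAX;
}
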
+
/*
* On-disk inode structure.
*
@@ -812,7 +892,7 @@ struct xfs_dinode {
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
__u8 di_format; /* format of di_c data */
- __be16 di_onlink; /* old number of links to file */
+ __be16 di_metatype; /* XFS_METAFILE_*; was di_onlink */
__be32 di_uid; /* owner's user id */
__be32 di_gid; /* owner's group id */
__be32 di_nlink; /* number of links to file */
@@ -1088,21 +1168,60 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
* Values for di_flags2 These start by being exposed to userspace in the upper
* 16 bits of the XFS_XFLAG_s range.
*/
-#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
-#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
-#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
-#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
-#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
+/* use DAX for this inode */
+#define XFS_DIFLAG2_DAX_BIT 0
+
+/* file's blocks may be shared */
+#define XFS_DIFLAG2_REFLINK_BIT 1
-#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
-#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
-#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
-#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
-#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
+/* copy on write extent size hint */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT 2
+
+/* big timestamps */
+#define XFS_DIFLAG2_BIGTIME_BIT 3
+
+/* large extent counters */
+#define XFS_DIFLAG2_NREXT64_BIT 4
+
+/*
+ * The inode contains filesystem metadata and can be found through the metadata
+ * directory tree. Metadata inodes must satisfy the following constraints:
+ *
+ * - V5 filesystem (and ftype) are enabled;
+ * - The only valid modes are regular files and directories;
+ * - The access bits must be zero;
+ * - DMAPI event and state masks are zero;
+ * - The user and group IDs must be zero;
+ * - The project ID can be used as a u32 annotation;
+ * - The immutable, sync, noatime, nodump, nodefrag flags must be set;
+ * - The dax flag must not be set;
+ * - Directories must have nosymlinks set.
+ *
+ * These requirements are chosen defensively to minimize the ability of
+ * userspace to read or modify the contents, should a metadata file ever
+ * escape to userspace.
+ *
+ * There are further constraints on the directory tree itself:
+ *
+ * - Metadata inodes must never be resolvable through the root directory;
+ * - They must never be accessed by userspace;
+ * - Metadata directory entries must have correct ftype.
+ *
+ * Superblock-rooted metadata files must have the METADATA iflag set even
+ * though they do not have a parent directory.
+ */
+#define XFS_DIFLAG2_METADATA_BIT 5
+
+#define XFS_DIFLAG2_DAX (1ULL << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK (1ULL << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE (1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1ULL << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1ULL << XFS_DIFLAG2_NREXT64_BIT)
+#define XFS_DIFLAG2_METADATA (1ULL << XFS_DIFLAG2_METADATA_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA)
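
Since XFS_DIFLAG2_ANY now includes the METADATA bit, the usual "no unknown
bits" style of verifier check keeps accepting metadata inodes; sketched for
illustration:

/* sketch: any flags2 bit outside XFS_DIFLAG2_ANY indicates corruption */
static inline bool xfs_diflags2_valid(uint64_t flags2)
{
	return !(flags2 & ~XFS_DIFLAG2_ANY);
}
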
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1117,6 +1236,12 @@ static inline bool xfs_dinode_has_large_extent_counts(
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
}
+static inline bool xfs_dinode_is_metadir(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1165,6 +1290,24 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
/*
+ * RT bit manipulation macros.
+ */
+#define XFS_RTBITMAP_MAGIC 0x424D505A /* BMPZ */
+#define XFS_RTSUMMARY_MAGIC 0x53554D59 /* SUMY */
+
+struct xfs_rtbuf_blkinfo {
+ __be32 rt_magic; /* validity check on block */
+ __be32 rt_crc; /* CRC of block */
+ __be64 rt_owner; /* inode that owns the block */
+ __be64 rt_blkno; /* first block of the buffer */
+ __be64 rt_lsn; /* sequence number of last write */
+ uuid_t rt_uuid; /* filesystem we belong to */
+};
+
+#define XFS_RTBUF_CRC_OFF \
+ offsetof(struct xfs_rtbuf_blkinfo, rt_crc)
+
+/*
* Dquot and dquot block format definitions
*/
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 860284064c5a..41ce4d3d650e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -187,7 +187,9 @@ struct xfs_fsop_geom {
__u32 logsunit; /* log stripe unit, bytes */
uint32_t sick; /* o: unhealthy fs & rt metadata */
uint32_t checked; /* o: checked fs & rt metadata */
- __u64 reserved[17]; /* reserved space */
+ __u32 rgextents; /* rt extents in a realtime group */
+ __u32 rgcount; /* number of realtime groups */
+ __u64 reserved[16]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -198,6 +200,8 @@ struct xfs_fsop_geom {
#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */
#define XFS_FSOP_GEOM_SICK_NLINKS (1 << 7) /* inode link counts */
+#define XFS_FSOP_GEOM_SICK_METADIR (1 << 8) /* metadata directory */
+#define XFS_FSOP_GEOM_SICK_METAPATH (1 << 9) /* metadir tree path */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -242,6 +246,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
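
Userspace can probe for the feature and read the new geometry fields through
the existing XFS_IOC_FSGEOMETRY ioctl; a hypothetical sketch, assuming the
usual <sys/ioctl.h> and <stdio.h> includes plus this header, where @fd is an
open descriptor on the filesystem:

static void print_rtgroups(int fd)
{
	struct xfs_fsop_geom geo = { 0 };

	if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) == 0 &&
	    (geo.flags & XFS_FSOP_GEOM_FLAGS_METADIR))
		printf("%u rtgroups of %u rt extents each\n",
		       geo.rgcount, geo.rgextents);
}
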
/*
 * Minimum and maximum sizes needed for growth checks.
@@ -489,9 +494,17 @@ struct xfs_bulk_ireq {
*/
#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+/*
+ * Allow bulkstat to return information about metadata directories. This
+ * enables xfs_scrub to find them for scanning, as they are otherwise ordinary
+ * directories.
+ */
+#define XFS_BULK_IREQ_METADIR (1U << 3)
+
#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
XFS_BULK_IREQ_SPECIAL | \
- XFS_BULK_IREQ_NREXT64)
+ XFS_BULK_IREQ_NREXT64 | \
+ XFS_BULK_IREQ_METADIR)
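
A hypothetical bulkstat request opting in to metadir inodes (sketch; assumes
the v5 struct xfs_bulkstat_req layout and XFS_IOC_BULKSTAT from this header):

static void bulkstat_with_metadir(int fd)
{
	struct xfs_bulkstat_req *req;

	req = calloc(1, sizeof(*req) + 16 * sizeof(struct xfs_bulkstat));
	if (!req)
		return;
	req->hdr.icount = 16;
	req->hdr.flags = XFS_BULK_IREQ_METADIR; /* include metadir inodes */
	if (ioctl(fd, XFS_IOC_BULKSTAT, req) < 0)
		perror("bulkstat");
	free(req);
}
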
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -722,9 +735,11 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */
#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */
#define XFS_SCRUB_TYPE_DIRTREE 28 /* directory tree structure */
+#define XFS_SCRUB_TYPE_METAPATH 29 /* metadata directory tree paths */
+#define XFS_SCRUB_TYPE_RGSUPER 30 /* realtime superblock */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 29
+#define XFS_SCRUB_TYPE_NR 31
/*
* This special type code only applies to the vectored scrub implementation.
@@ -803,6 +818,22 @@ struct xfs_scrub_vec_head {
#define XFS_SCRUB_VEC_FLAGS_ALL (0)
/*
+ * i: sm_ino values for XFS_SCRUB_TYPE_METAPATH to select a metadata file for
+ * path checking.
+ */
+#define XFS_SCRUB_METAPATH_PROBE (0) /* do we have a metapath scrubber? */
+#define XFS_SCRUB_METAPATH_RTDIR (1) /* rtrgroups metadir */
+#define XFS_SCRUB_METAPATH_RTBITMAP (2) /* per-rtg bitmap */
+#define XFS_SCRUB_METAPATH_RTSUMMARY (3) /* per-rtg summary */
+#define XFS_SCRUB_METAPATH_QUOTADIR (4) /* quota metadir */
+#define XFS_SCRUB_METAPATH_USRQUOTA (5) /* user quota */
+#define XFS_SCRUB_METAPATH_GRPQUOTA (6) /* group quota */
+#define XFS_SCRUB_METAPATH_PRJQUOTA (7) /* project quota */
+
+/* Number of metapath sm_ino values */
+#define XFS_SCRUB_METAPATH_NR (8)
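
For example, a scrub client might first probe for the metapath scrubber and
then check the user quota path; sketch only, using the existing
XFS_IOC_SCRUB_METADATA calling convention:

static int scrub_usrquota_path(int fd)
{
	struct xfs_scrub_metadata sm = {
		.sm_type = XFS_SCRUB_TYPE_METAPATH,
		.sm_ino = XFS_SCRUB_METAPATH_PROBE,
	};

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0)
		return -1;		/* no metapath scrubber here */
	sm.sm_ino = XFS_SCRUB_METAPATH_USRQUOTA;
	return ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
}
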
+
+/*
* ioctl limits
*/
#ifdef XATTR_LIST_MAX
@@ -949,6 +980,21 @@ struct xfs_getparents_by_handle {
};
/*
+ * Output for XFS_IOC_RTGROUP_GEOMETRY
+ */
+struct xfs_rtgroup_geometry {
+ __u32 rg_number; /* i/o: rtgroup number */
+ __u32 rg_length; /* o: length in blocks */
+ __u32 rg_sick; /* o: sick things in ag */
+ __u32 rg_checked; /* o: checked metadata in ag */
+ __u32 rg_flags; /* i/o: flags for this ag */
+ __u32 rg_reserved[27]; /* o: zero */
+};
+#define XFS_RTGROUP_GEOM_SICK_SUPER (1U << 0) /* superblock */
+#define XFS_RTGROUP_GEOM_SICK_BITMAP (1U << 1) /* rtbitmap */
+#define XFS_RTGROUP_GEOM_SICK_SUMMARY (1U << 2) /* rtsummary */
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
@@ -986,6 +1032,7 @@ struct xfs_getparents_by_handle {
#define XFS_IOC_GETPARENTS _IOWR('X', 62, struct xfs_getparents)
#define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle)
#define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head)
+#define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry)
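
Sketch of the matching userspace query for a single rt group; rg_number is
the only input, the remaining fields are outputs:

static void print_rtgroup(int fd, __u32 rgno)
{
	struct xfs_rtgroup_geometry rgeo = { .rg_number = rgno };

	if (ioctl(fd, XFS_IOC_RTGROUP_GEOMETRY, &rgeo) == 0)
		printf("rtgroup %u: %u blocks, sick 0x%x\n",
		       rgeo.rg_number, rgeo.rg_length, rgeo.rg_sick);
}
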
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c
new file mode 100644
index 000000000000..e9d76bcdc820
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_group.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ */
+
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_extent_busy.h"
+#include "xfs_group.h"
+
+/*
+ * Groups can have passive and active references.
+ *
+ * For passive references the code freeing a group is responsible for cleaning
+ * up objects that hold the passive references (e.g. cached buffers).
+ * Routines manipulating passive references are xfs_group_get, xfs_group_hold
+ * and xfs_group_put.
+ *
+ * Active references are for short-term access to the group for walking trees or
+ * accessing state. If a group is being shrunk or offlined, the lookup will fail
+ * to find that group and return NULL instead.
+ * Routines manipulating active references are xfs_group_grab and
+ * xfs_group_rele.
+ */
+
+struct xfs_group *
+xfs_group_get(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ struct xfs_group *xg;
+
+ rcu_read_lock();
+ xg = xa_load(&mp->m_groups[type].xa, index);
+ if (xg) {
+ trace_xfs_group_get(xg, _RET_IP_);
+ ASSERT(atomic_read(&xg->xg_ref) >= 0);
+ atomic_inc(&xg->xg_ref);
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+struct xfs_group *
+xfs_group_hold(
+ struct xfs_group *xg)
+{
+ ASSERT(atomic_read(&xg->xg_ref) > 0 ||
+ atomic_read(&xg->xg_active_ref) > 0);
+
+ trace_xfs_group_hold(xg, _RET_IP_);
+ atomic_inc(&xg->xg_ref);
+ return xg;
+}
+
+void
+xfs_group_put(
+ struct xfs_group *xg)
+{
+ trace_xfs_group_put(xg, _RET_IP_);
+
+ ASSERT(atomic_read(&xg->xg_ref) > 0);
+ atomic_dec(&xg->xg_ref);
+}
+
+struct xfs_group *
+xfs_group_grab(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ struct xfs_group *xg;
+
+ rcu_read_lock();
+ xg = xa_load(&mp->m_groups[type].xa, index);
+ if (xg) {
+ trace_xfs_group_grab(xg, _RET_IP_);
+ if (!atomic_inc_not_zero(&xg->xg_active_ref))
+ xg = NULL;
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+/*
+ * Iterate to the next group.  Pass a %NULL @xg to start the iteration at
+ * @start_index; otherwise pass the group previously returned from this
+ * function.  The caller should break out of the loop when this returns
+ * %NULL.  If the caller breaks out of the loop before it finishes, it must
+ * release the active reference to @xg by calling xfs_group_rele() itself.
+ */
+struct xfs_group *
+xfs_group_next_range(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ uint32_t start_index,
+ uint32_t end_index,
+ enum xfs_group_type type)
+{
+ uint32_t index = start_index;
+
+ if (xg) {
+ index = xg->xg_gno + 1;
+ xfs_group_rele(xg);
+ }
+ if (index > end_index)
+ return NULL;
+ return xfs_group_grab(mp, index, type);
+}
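
The intended calling convention, sketched for illustration; XG_TYPE_AG is
the per-AG group type defined elsewhere in this series:

struct xfs_group *xg = NULL;

while ((xg = xfs_group_next_range(mp, xg, 0,
				mp->m_sb.sb_agcount - 1, XG_TYPE_AG))) {
	if (xfs_is_shutdown(mp)) {
		/* early exit: drop the active reference ourselves */
		xfs_group_rele(xg);
		break;
	}
	/* ... operate on xg ... */
}
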
+
+/*
+ * Find the next group marked with @mark after @xg, or the first such group
+ * if @xg is NULL.
+ */
+struct xfs_group *
+xfs_group_grab_next_mark(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ xa_mark_t mark,
+ enum xfs_group_type type)
+{
+ unsigned long index = 0;
+
+ if (xg) {
+ index = xg->xg_gno + 1;
+ xfs_group_rele(xg);
+ }
+
+ rcu_read_lock();
+ xg = xa_find(&mp->m_groups[type].xa, &index, ULONG_MAX, mark);
+ if (xg) {
+ trace_xfs_group_grab_next_tag(xg, _RET_IP_);
+ if (!atomic_inc_not_zero(&xg->xg_active_ref))
+ xg = NULL;
+ }
+ rcu_read_unlock();
+ return xg;
+}
+
+void
+xfs_group_rele(
+ struct xfs_group *xg)
+{
+ trace_xfs_group_rele(xg, _RET_IP_);
+ atomic_dec(&xg->xg_active_ref);
+}
+
+void
+xfs_group_free(
+ struct xfs_mount *mp,
+ uint32_t index,
+ enum xfs_group_type type,
+ void (*uninit)(struct xfs_group *xg))
+{
+ struct xfs_group *xg = xa_erase(&mp->m_groups[type].xa, index);
+
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_ref) != 0);
+
+ xfs_defer_drain_free(&xg->xg_intents_drain);
+#ifdef __KERNEL__
+ kfree(xg->xg_busy_extents);
+#endif
+
+ if (uninit)
+ uninit(xg);
+
+ /* drop the mount's active reference */
+ xfs_group_rele(xg);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0);
+ kfree_rcu_mightsleep(xg);
+}
+
+int
+xfs_group_insert(
+ struct xfs_mount *mp,
+ struct xfs_group *xg,
+ uint32_t index,
+ enum xfs_group_type type)
+{
+ int error;
+
+ xg->xg_mount = mp;
+ xg->xg_gno = index;
+ xg->xg_type = type;
+
+#ifdef __KERNEL__
+ xg->xg_busy_extents = xfs_extent_busy_alloc();
+ if (!xg->xg_busy_extents)
+ return -ENOMEM;
+ spin_lock_init(&xg->xg_state_lock);
+ xfs_hooks_init(&xg->xg_rmap_update_hooks);
+#endif
+ xfs_defer_drain_init(&xg->xg_intents_drain);
+
+ /* Active ref owned by mount indicates group is online. */
+ atomic_set(&xg->xg_active_ref, 1);
+
+ error = xa_insert(&mp->m_groups[type].xa, index, xg, GFP_KERNEL);
+ if (error) {
+ WARN_ON_ONCE(error == -EBUSY);
+ goto out_drain;
+ }
+
+ return 0;
+out_drain:
+ xfs_defer_drain_free(&xg->xg_intents_drain);
+#ifdef __KERNEL__
+ kfree(xg->xg_busy_extents);
+#endif
+ return error;
+}
+
+struct xfs_group *
+xfs_group_get_by_fsb(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ return xfs_group_get(mp, xfs_fsb_to_gno(mp, fsbno, type), type);
+}
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
new file mode 100644
index 000000000000..242b05627c7a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ */
+#ifndef __LIBXFS_GROUP_H
+#define __LIBXFS_GROUP_H 1
+
+struct xfs_group {
+ struct xfs_mount *xg_mount;
+ uint32_t xg_gno;
+ enum xfs_group_type xg_type;
+ atomic_t xg_ref; /* passive reference count */
+ atomic_t xg_active_ref; /* active reference count */
+
+ /* Precalculated geometry info */
+ uint32_t xg_block_count; /* max usable gbno */
+ uint32_t xg_min_gbno; /* min usable gbno */
+
+#ifdef __KERNEL__
+ /* -- kernel only structures below this line -- */
+
+ /*
+ * Track freed but not yet committed extents.
+ */
+ struct xfs_extent_busy_tree *xg_busy_extents;
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold xg_state_lock before accessing this field.
+ */
+ uint16_t xg_checked;
+ uint16_t xg_sick;
+ spinlock_t xg_state_lock;
+
+ /*
+ * We use xfs_drain to track the number of deferred log intent items
+ * that have been queued (but not yet processed) so that waiters (e.g.
+ * scrub) will not lock resources when other threads are in the middle
+ * of processing a chain of intent items only to find momentary
+ * inconsistencies.
+ */
+ struct xfs_defer_drain xg_intents_drain;
+
+ /*
+ * Hook to feed rmapbt updates to an active online repair.
+ */
+ struct xfs_hooks xg_rmap_update_hooks;
+#endif /* __KERNEL__ */
+};
+
+struct xfs_group *xfs_group_get(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_get_by_fsb(struct xfs_mount *mp,
+ xfs_fsblock_t fsbno, enum xfs_group_type type);
+struct xfs_group *xfs_group_hold(struct xfs_group *xg);
+void xfs_group_put(struct xfs_group *xg);
+
+struct xfs_group *xfs_group_grab(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_next_range(struct xfs_mount *mp,
+ struct xfs_group *xg, uint32_t start_index, uint32_t end_index,
+ enum xfs_group_type type);
+struct xfs_group *xfs_group_grab_next_mark(struct xfs_mount *mp,
+ struct xfs_group *xg, xa_mark_t mark, enum xfs_group_type type);
+void xfs_group_rele(struct xfs_group *xg);
+
+void xfs_group_free(struct xfs_mount *mp, uint32_t index,
+ enum xfs_group_type type, void (*uninit)(struct xfs_group *xg));
+int xfs_group_insert(struct xfs_mount *mp, struct xfs_group *xg,
+ uint32_t index, enum xfs_group_type);
+
+#define xfs_group_set_mark(_xg, _mark) \
+ xa_set_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \
+ (_xg)->xg_gno, (_mark))
+#define xfs_group_clear_mark(_xg, _mark) \
+ xa_clear_mark(&(_xg)->xg_mount->m_groups[(_xg)->xg_type].xa, \
+ (_xg)->xg_gno, (_mark))
+#define xfs_group_marked(_mp, _type, _mark) \
+ xa_marked(&(_mp)->m_groups[(_type)].xa, (_mark))
+
+static inline xfs_agblock_t
+xfs_group_max_blocks(
+ struct xfs_group *xg)
+{
+ return xg->xg_mount->m_groups[xg->xg_type].blocks;
+}
+
+static inline xfs_fsblock_t
+xfs_group_start_fsb(
+ struct xfs_group *xg)
+{
+ return ((xfs_fsblock_t)xg->xg_gno) <<
+ xg->xg_mount->m_groups[xg->xg_type].blklog;
+}
+
+static inline xfs_fsblock_t
+xfs_gbno_to_fsb(
+ struct xfs_group *xg,
+ xfs_agblock_t gbno)
+{
+ return xfs_group_start_fsb(xg) | gbno;
+}
+
+static inline xfs_daddr_t
+xfs_gbno_to_daddr(
+ struct xfs_group *xg,
+ xfs_agblock_t gbno)
+{
+ struct xfs_mount *mp = xg->xg_mount;
+ uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
+
+ return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
+}
+
+static inline uint32_t
+xfs_fsb_to_gno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ if (!mp->m_groups[type].blklog)
+ return 0;
+ return fsbno >> mp->m_groups[type].blklog;
+}
+
+static inline xfs_agblock_t
+xfs_fsb_to_gbno(
+ struct xfs_mount *mp,
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
+{
+ return fsbno & mp->m_groups[type].blkmask;
+}
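
Sketch (not in the patch) of the round-trip invariant these helpers provide
for a block that lies inside @xg: the high bits of the fsbno select the
group and the masked low bits give the offset within it.

static inline bool
xfs_fsb_in_group(
	struct xfs_group	*xg,
	xfs_fsblock_t		fsbno)
{
	struct xfs_mount	*mp = xg->xg_mount;

	return xfs_fsb_to_gno(mp, fsbno, xg->xg_type) == xg->xg_gno &&
	       xfs_gbno_to_fsb(xg,
			xfs_fsb_to_gbno(mp, fsbno, xg->xg_type)) == fsbno;
}
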
+
+static inline bool
+xfs_verify_gbno(
+ struct xfs_group *xg,
+ uint32_t gbno)
+{
+ if (gbno >= xg->xg_block_count)
+ return false;
+ if (gbno < xg->xg_min_gbno)
+ return false;
+ return true;
+}
+
+static inline bool
+xfs_verify_gbext(
+ struct xfs_group *xg,
+ uint32_t gbno,
+ uint32_t glen)
+{
+ uint32_t end;
+
+ if (!xfs_verify_gbno(xg, gbno))
+ return false;
+ if (glen == 0 || check_add_overflow(gbno, glen - 1, &end))
+ return false;
+ if (!xfs_verify_gbno(xg, end))
+ return false;
+ return true;
+}
+
+#endif /* __LIBXFS_GROUP_H */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index b0edb4288e59..d34986ac18c3 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -6,6 +6,8 @@
#ifndef __XFS_HEALTH_H__
#define __XFS_HEALTH_H__
+struct xfs_group;
+
/*
* In-Core Filesystem Health Assessments
* =====================================
@@ -52,6 +54,7 @@ struct xfs_inode;
struct xfs_fsop_geom;
struct xfs_btree_cur;
struct xfs_da_args;
+struct xfs_rtgroup;
/* Observable health issues for metadata spanning the entire filesystem. */
#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
@@ -60,10 +63,13 @@ struct xfs_da_args;
#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */
#define XFS_SICK_FS_NLINKS (1 << 5) /* inode link counts */
+#define XFS_SICK_FS_METADIR (1 << 6) /* metadata directory tree */
+#define XFS_SICK_FS_METAPATH (1 << 7) /* metadata directory tree path */
-/* Observable health issues for realtime volume metadata. */
-#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
-#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+/* Observable health issues for realtime group metadata. */
+#define XFS_SICK_RG_SUPER (1 << 0) /* rt group superblock */
+#define XFS_SICK_RG_BITMAP (1 << 1) /* rt group bitmap */
+#define XFS_SICK_RG_SUMMARY (1 << 2) /* rt groups summary */
/* Observable health issues for AG metadata. */
#define XFS_SICK_AG_SB (1 << 0) /* superblock */
@@ -103,10 +109,13 @@ struct xfs_da_args;
XFS_SICK_FS_GQUOTA | \
XFS_SICK_FS_PQUOTA | \
XFS_SICK_FS_QUOTACHECK | \
- XFS_SICK_FS_NLINKS)
+ XFS_SICK_FS_NLINKS | \
+ XFS_SICK_FS_METADIR | \
+ XFS_SICK_FS_METAPATH)
-#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
- XFS_SICK_RT_SUMMARY)
+#define XFS_SICK_RG_PRIMARY (XFS_SICK_RG_SUPER | \
+ XFS_SICK_RG_BITMAP | \
+ XFS_SICK_RG_SUMMARY)
#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
XFS_SICK_AG_AGF | \
@@ -136,26 +145,26 @@ struct xfs_da_args;
/* Secondary state related to (but not primary evidence of) health problems. */
#define XFS_SICK_FS_SECONDARY (0)
-#define XFS_SICK_RT_SECONDARY (0)
+#define XFS_SICK_RG_SECONDARY (0)
#define XFS_SICK_AG_SECONDARY (0)
#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET)
/* Evidence of health problems elsewhere. */
#define XFS_SICK_FS_INDIRECT (0)
-#define XFS_SICK_RT_INDIRECT (0)
+#define XFS_SICK_RG_INDIRECT (0)
#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES)
#define XFS_SICK_INO_INDIRECT (0)
/* All health masks. */
-#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
+#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \
XFS_SICK_FS_SECONDARY | \
XFS_SICK_FS_INDIRECT)
-#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \
- XFS_SICK_RT_SECONDARY | \
- XFS_SICK_RT_INDIRECT)
+#define XFS_SICK_RG_ALL (XFS_SICK_RG_PRIMARY | \
+ XFS_SICK_RG_SECONDARY | \
+ XFS_SICK_RG_INDIRECT)
-#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
+#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \
XFS_SICK_AG_SECONDARY | \
XFS_SICK_AG_INDIRECT)
@@ -189,18 +198,17 @@ void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
unsigned int *checked);
-void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
-void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
- unsigned int *checked);
+void xfs_rgno_mark_sick(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ unsigned int mask);
void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
unsigned int mask);
-void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
-void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+void xfs_group_mark_sick(struct xfs_group *xg, unsigned int mask);
+#define xfs_ag_mark_sick(pag, mask) \
+ xfs_group_mark_sick(pag_group(pag), (mask))
+void xfs_group_mark_corrupt(struct xfs_group *xg, unsigned int mask);
+void xfs_group_mark_healthy(struct xfs_group *xg, unsigned int mask);
+void xfs_group_measure_sickness(struct xfs_group *xg, unsigned int *sick,
unsigned int *checked);
void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
@@ -227,22 +235,25 @@ xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
}
static inline bool
-xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+xfs_group_has_sickness(
+ struct xfs_group *xg,
+ unsigned int mask)
{
- unsigned int sick, checked;
+ unsigned int sick, checked;
- xfs_rt_measure_sickness(mp, &sick, &checked);
+ xfs_group_measure_sickness(xg, &sick, &checked);
return sick & mask;
}
-static inline bool
-xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
-{
- unsigned int sick, checked;
+#define xfs_ag_has_sickness(pag, mask) \
+ xfs_group_has_sickness(pag_group(pag), (mask))
+#define xfs_ag_is_healthy(pag) \
+ (!xfs_ag_has_sickness((pag), UINT_MAX))
- xfs_ag_measure_sickness(pag, &sick, &checked);
- return sick & mask;
-}
+#define xfs_rtgroup_has_sickness(rtg, mask) \
+ xfs_group_has_sickness(rtg_group(rtg), (mask))
+#define xfs_rtgroup_is_healthy(rtg) \
+ (!xfs_rtgroup_has_sickness((rtg), UINT_MAX))
static inline bool
xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
@@ -260,18 +271,6 @@ xfs_fs_is_healthy(struct xfs_mount *mp)
}
static inline bool
-xfs_rt_is_healthy(struct xfs_mount *mp)
-{
- return !xfs_rt_has_sickness(mp, -1U);
-}
-
-static inline bool
-xfs_ag_is_healthy(struct xfs_perag *pag)
-{
- return !xfs_ag_has_sickness(pag, -1U);
-}
-
-static inline bool
xfs_inode_is_healthy(struct xfs_inode *ip)
{
return !xfs_inode_has_sickness(ip, -1U);
@@ -279,6 +278,8 @@ xfs_inode_is_healthy(struct xfs_inode *ip)
void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_rtgroup_geom_health(struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo);
void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
#define xfs_metadata_is_sick(error) \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 271855227514..8b84e2cf711b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -142,7 +142,7 @@ xfs_inobt_complain_bad_rec(
xfs_warn(mp,
"%sbt record corruption in AG %d detected at %pS!",
- cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_ops->name, cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
irec->ir_startino, irec->ir_count, irec->ir_freecount,
@@ -170,7 +170,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
- fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
+ fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
@@ -275,8 +275,10 @@ xfs_check_agi_freecount(
}
} while (i == 1);
- if (!xfs_is_shutdown(cur->bc_mp))
- ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
+ if (!xfs_is_shutdown(cur->bc_mp)) {
+ ASSERT(freecount ==
+ to_perag(cur->bc_group)->pagi_freecount);
+ }
}
return 0;
}
@@ -551,7 +553,7 @@ xfs_inobt_insert_sprec(
struct xfs_buf *agbp,
struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
int error;
int i;
@@ -606,15 +608,12 @@ xfs_inobt_insert_sprec(
goto error;
}
- trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
- rec.ir_holemask, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_pre(pag, &rec, nrec);
/* merge to nrec to output the updated record */
__xfs_inobt_rec_merge(nrec, &rec);
- trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
- nrec->ir_holemask);
+ trace_xfs_irec_merge_post(pag, nrec);
error = xfs_inobt_rec_check_count(mp, nrec);
if (error)
@@ -648,7 +647,7 @@ xfs_finobt_insert_sprec(
struct xfs_buf *agbp,
struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
int error;
int i;
@@ -768,8 +767,7 @@ xfs_ialloc_ag_alloc(
/* Allow space for the inode btree to split. */
args.minleft = igeo->inobt_maxlevels;
error = xfs_alloc_vextent_exact_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- args.agbno));
+ xfs_agbno_to_fsb(pag, args.agbno));
if (error)
return error;
@@ -811,8 +809,8 @@ xfs_ialloc_ag_alloc(
*/
args.minleft = igeo->inobt_maxlevels;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
}
@@ -824,8 +822,8 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.alignment = igeo->cluster_align;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
}
@@ -860,8 +858,8 @@ sparse_alloc:
igeo->ialloc_blks;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
- be32_to_cpu(agi->agi_root)));
+ xfs_agbno_to_fsb(pag,
+ be32_to_cpu(agi->agi_root)));
if (error)
return error;
@@ -884,7 +882,7 @@ sparse_alloc:
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag),
args.agbno, args.len, get_random_u32());
if (error)
@@ -915,8 +913,7 @@ sparse_alloc:
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
- XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
- rec.ir_startino),
+ xfs_agino_to_ino(pag, rec.ir_startino),
rec.ir_holemask, rec.ir_count);
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
}
@@ -1076,7 +1073,7 @@ xfs_dialloc_check_ino(
if (error)
return -EAGAIN;
- error = xfs_imap_to_bp(pag->pag_mount, tp, &imap, &bp);
+ error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp);
if (error)
return -EAGAIN;
@@ -1127,7 +1124,7 @@ xfs_dialloc_ag_inobt(
/*
* If in the same AG as the parent, try to get near the parent.
*/
- if (pagno == pag->pag_agno) {
+ if (pagno == pag_agno(pag)) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
@@ -1335,7 +1332,7 @@ alloc_inode:
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
@@ -1604,7 +1601,7 @@ xfs_dialloc_ag(
* parent. If so, find the closest available inode to the parent. If
* not, consider the agi hint or find the first free inode in the AG.
*/
- if (pag->pag_agno == pagno)
+ if (pag_agno(pag) == pagno)
error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
else
error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
@@ -1616,7 +1613,7 @@ xfs_dialloc_ag(
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
+ ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);
if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
error = xfs_dialloc_check_ino(pag, tp, ino);
@@ -1845,6 +1842,40 @@ out_release:
}
/*
+ * Pick an AG for the new inode.
+ *
+ * Directories, symlinks, and regular files frequently allocate at least one
+ * block, so factor in that potential expansion when we examine whether an
+ * AG has enough space for file creation. Try to keep metadata files all in
+ * the same AG.
+ */
+static inline xfs_agnumber_t
+xfs_dialloc_pick_ag(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp,
+ umode_t mode)
+{
+ xfs_agnumber_t start_agno;
+
+ if (!dp)
+ return 0;
+ if (xfs_is_metadir_inode(dp)) {
+ if (mp->m_sb.sb_logstart)
+ return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+ return 0;
+ }
+
+ if (S_ISDIR(mode))
+ return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;
+
+ start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
+ if (start_agno >= mp->m_maxagi)
+ start_agno = 0;
+
+ return start_agno;
+}
+
+/*
* Allocate an on-disk inode.
*
* Mode is used to tell whether the new inode is a directory and hence where to
@@ -1859,31 +1890,19 @@ xfs_dialloc(
xfs_ino_t *new_ino)
{
struct xfs_mount *mp = (*tpp)->t_mountp;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_ino_t ino = NULLFSINO;
xfs_ino_t parent = args->pip ? args->pip->i_ino : 0;
- umode_t mode = args->mode & S_IFMT;
xfs_agnumber_t agno;
- int error = 0;
xfs_agnumber_t start_agno;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ umode_t mode = args->mode & S_IFMT;
bool ok_alloc = true;
bool low_space = false;
int flags;
- xfs_ino_t ino = NULLFSINO;
+ int error = 0;
- /*
- * Directories, symlinks, and regular files frequently allocate at least
- * one block, so factor that potential expansion when we examine whether
- * an AG has enough space for file creation.
- */
- if (S_ISDIR(mode))
- start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
- mp->m_maxagi;
- else {
- start_agno = XFS_INO_TO_AGNO(mp, parent);
- if (start_agno >= mp->m_maxagi)
- start_agno = 0;
- }
+ start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode);
/*
* If we have already hit the ceiling of inode blocks then clear
@@ -1974,7 +1993,7 @@ retry:
static int
xfs_difree_inode_chunk(
struct xfs_trans *tp,
- xfs_agnumber_t agno,
+ struct xfs_perag *pag,
struct xfs_inobt_rec_incore *rec)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1988,8 +2007,7 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
- return xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, agno, sagbno),
+ return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
XFS_AG_RESV_NONE, 0);
}
@@ -2035,9 +2053,9 @@ xfs_difree_inode_chunk(
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
- error = xfs_free_extent_later(tp,
- XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0);
+ error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno),
+ contigblk, &XFS_RMAP_OINFO_INODES,
+ XFS_AG_RESV_NONE, 0);
if (error)
return error;
@@ -2059,7 +2077,7 @@ xfs_difree_inobt(
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_agi *agi = agbp->b_addr;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
@@ -2124,8 +2142,7 @@ xfs_difree_inobt(
if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
- xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
- rec.ir_startino);
+ xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
@@ -2148,7 +2165,7 @@ xfs_difree_inobt(
goto error0;
}
- error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+ error = xfs_difree_inode_chunk(tp, pag, &rec);
if (error)
goto error0;
} else {
@@ -2194,7 +2211,7 @@ xfs_difree_finobt(
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int offset = agino - ibtrec->ir_startino;
@@ -2317,17 +2334,17 @@ xfs_difree(
/*
* Break up inode number into its components.
*/
- if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
- xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
- __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
+ if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) {
+ xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).",
+ __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag));
ASSERT(0);
return -EINVAL;
}
agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
- xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ if (inode != xfs_agino_to_ino(pag, agino)) {
+ xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
__func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ (unsigned long long)xfs_agino_to_ino(pag, agino));
ASSERT(0);
return -EINVAL;
}
@@ -2380,7 +2397,7 @@ xfs_imap_lookup(
xfs_agblock_t *offset_agbno,
int flags)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_inobt_rec_incore rec;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
@@ -2391,7 +2408,7 @@ xfs_imap_lookup(
if (error) {
xfs_alert(mp,
"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
- __func__, error, pag->pag_agno);
+ __func__, error, pag_agno(pag));
return error;
}
@@ -2441,7 +2458,7 @@ xfs_imap(
struct xfs_imap *imap, /* location map structure */
uint flags) /* flags for inode btree lookup */
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
xfs_agblock_t agbno; /* block number of inode in the alloc group */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
@@ -2458,7 +2475,7 @@ xfs_imap(
agino = XFS_INO_TO_AGINO(mp, ino);
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
if (agbno >= mp->m_sb.sb_agblocks ||
- ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ ino != xfs_agino_to_ino(pag, agino)) {
error = -EINVAL;
#ifdef DEBUG
/*
@@ -2473,11 +2490,11 @@ xfs_imap(
__func__, (unsigned long long)agbno,
(unsigned long)mp->m_sb.sb_agblocks);
}
- if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
+ if (ino != xfs_agino_to_ino(pag, agino)) {
xfs_alert(mp,
- "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
__func__, ino,
- XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
+ xfs_agino_to_ino(pag, agino));
}
xfs_stack_trace();
#endif /* DEBUG */
@@ -2507,7 +2524,7 @@ xfs_imap(
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
+ imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
imap->im_boffset = (unsigned short)(offset <<
mp->m_sb.sb_inodelog);
@@ -2537,7 +2554,7 @@ out_map:
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
+ imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
@@ -2733,13 +2750,13 @@ xfs_read_agi(
xfs_buf_flags_t flags,
struct xfs_buf **agibpp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
- trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
+ trace_xfs_read_agi(pag);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
@@ -2767,7 +2784,7 @@ xfs_ialloc_read_agi(
struct xfs_agi *agi;
int error;
- trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
+ trace_xfs_ialloc_read_agi(pag);
error = xfs_read_agi(pag, tp,
(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
@@ -2787,7 +2804,7 @@ xfs_ialloc_read_agi(
* we are in the middle of a forced shutdown.
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(pag->pag_mount));
+ xfs_is_shutdown(pag_mount(pag)));
if (agibpp)
*agibpp = agibp;
else
@@ -2887,7 +2904,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
- fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
@@ -3126,13 +3143,13 @@ xfs_ialloc_check_shrink(
int has;
int error;
- if (!xfs_has_sparseinodes(pag->pag_mount))
+ if (!xfs_has_sparseinodes(pag_mount(pag)))
return 0;
cur = xfs_inobt_init_cursor(pag, tp, agibp);
/* Look up the inobt record that would correspond to the new EOFS. */
- agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
+ agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
if (error || !has)
goto out;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 401b42d52af6..9b34896dd1a3 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -37,7 +37,7 @@ STATIC struct xfs_btree_cur *
xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ return xfs_inobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp);
}
@@ -45,7 +45,7 @@ STATIC struct xfs_btree_cur *
xfs_finobt_dup_cursor(
struct xfs_btree_cur *cur)
{
- return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+ return xfs_finobt_init_cursor(to_perag(cur->bc_group), cur->bc_tp,
cur->bc_ag.agbp);
}
@@ -112,7 +112,7 @@ __xfs_inobt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.pag = cur->bc_ag.pag;
+ args.pag = to_perag(cur->bc_group);
args.oinfo = XFS_RMAP_OINFO_INOBT;
args.minlen = 1;
args.maxlen = 1;
@@ -120,7 +120,7 @@ __xfs_inobt_alloc_block(
args.resv = resv;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno));
+ xfs_agbno_to_fsb(args.pag, sbno));
if (error)
return error;
@@ -248,7 +248,7 @@ xfs_inobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -260,7 +260,8 @@ xfs_finobt_init_ptr_from_cur(
{
struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agi->agi_seqno));
+
ptr->s = agi->agi_free_root;
}
@@ -478,12 +479,12 @@ xfs_inobt_init_cursor(
struct xfs_trans *tp,
struct xfs_buf *agbp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agi *agi = agbp->b_addr;
@@ -504,12 +505,12 @@ xfs_finobt_init_cursor(
struct xfs_trans *tp,
struct xfs_buf *agbp)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_btree_cur *cur;
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agi *agi = agbp->b_addr;
@@ -715,8 +716,8 @@ static xfs_extlen_t
xfs_inobt_max_size(
struct xfs_perag *pag)
{
- struct xfs_mount *mp = pag->pag_mount;
- xfs_agblock_t agblocks = pag->block_count;
+ struct xfs_mount *mp = pag_mount(pag);
+ xfs_agblock_t agblocks = pag_group(pag)->xg_block_count;
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (M_IGEO(mp)->inobt_mxr[0] == 0)
@@ -727,7 +728,7 @@ xfs_inobt_max_size(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
@@ -791,10 +792,10 @@ xfs_finobt_calc_reserves(
xfs_extlen_t tree_len = 0;
int error;
- if (!xfs_has_finobt(pag->pag_mount))
+ if (!xfs_has_finobt(pag_mount(pag)))
return 0;
- if (xfs_has_inobtcounts(pag->pag_mount))
+ if (xfs_has_inobtcounts(pag_mount(pag)))
error = xfs_finobt_read_blocks(pag, tp, &tree_len);
else
error = xfs_finobt_count_blocks(pag, tp, &tree_len);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 79babeac9d75..424861fbf1bd 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -19,6 +19,7 @@
#include "xfs_ialloc.h"
#include "xfs_dir2.h"
#include "xfs_health.h"
+#include "xfs_metafile.h"
#include <linux/iversion.h>
@@ -209,12 +210,15 @@ xfs_inode_from_disk(
* They will also be unconditionally written back to disk as v2 inodes.
*/
if (unlikely(from->di_version == 1)) {
- set_nlink(inode, be16_to_cpu(from->di_onlink));
+ /* di_metatype used to be di_onlink */
+ set_nlink(inode, be16_to_cpu(from->di_metatype));
ip->i_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
be16_to_cpu(from->di_projid_lo);
+ if (xfs_dinode_is_metadir(from))
+ ip->i_metatype = be16_to_cpu(from->di_metatype);
}
i_uid_write(inode, be32_to_cpu(from->di_uid));
@@ -315,7 +319,10 @@ xfs_inode_to_disk(
struct inode *inode = VFS_I(ip);
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- to->di_onlink = 0;
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = cpu_to_be16(ip->i_metatype);
+ else
+ to->di_metatype = 0;
to->di_format = xfs_ifork_format(&ip->i_df);
to->di_uid = cpu_to_be32(i_uid_read(inode));
@@ -483,6 +490,69 @@ xfs_dinode_verify_nrext64(
return NULL;
}
+/*
+ * Validate all the picky requirements we have for a file that claims to be
+ * filesystem metadata.
+ */
+xfs_failaddr_t
+xfs_dinode_verify_metadir(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ uint16_t mode,
+ uint16_t flags,
+ uint64_t flags2)
+{
+ if (!xfs_has_metadir(mp))
+ return __this_address;
+
+ /* V5 filesystem only */
+ if (dip->di_version < 3)
+ return __this_address;
+
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ return __this_address;
+
+ /* V3 inode fields that are always zero */
+ if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad)
+ return __this_address;
+ if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter)
+ return __this_address;
+
+ /* Metadata files can only be directories or regular files */
+ if (!S_ISDIR(mode) && !S_ISREG(mode))
+ return __this_address;
+
+ /* They must have zero access permissions */
+ if (mode & 0777)
+ return __this_address;
+
+ /* DMAPI event and state masks are zero */
+ if (dip->di_dmevmask || dip->di_dmstate)
+ return __this_address;
+
+ /*
+ * User and group IDs must be zero. The project ID is used for
+ * grouping inodes. Metadata inodes are never accounted to quotas.
+ */
+ if (dip->di_uid || dip->di_gid)
+ return __this_address;
+
+ /* Mandatory inode flags must be set */
+ if (S_ISDIR(mode)) {
+ if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS)
+ return __this_address;
+ } else {
+ if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS)
+ return __this_address;
+ }
+
+ /* dax flags2 must not be set */
+ if (flags2 & XFS_DIFLAG2_DAX)
+ return __this_address;
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -523,8 +593,11 @@ xfs_dinode_verify(
* di_nlink==0 on a V1 inode. V2/3 inodes would get written out with
* di_onlink==0, so we can check that.
*/
- if (dip->di_version >= 2) {
- if (dip->di_onlink)
+ if (dip->di_version == 2) {
+ if (dip->di_metatype)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (!xfs_dinode_is_metadir(dip) && dip->di_metatype)
return __this_address;
}
@@ -546,7 +619,8 @@ xfs_dinode_verify(
if (dip->di_nlink)
return __this_address;
} else {
- if (dip->di_onlink)
+ /* di_metatype used to be di_onlink */
+ if (dip->di_metatype)
return __this_address;
}
}
@@ -663,6 +737,12 @@ xfs_dinode_verify(
!xfs_has_bigtime(mp))
return __this_address;
+ if (flags2 & XFS_DIFLAG2_METADATA) {
+ fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
+
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 585ed5a110af..8d43d2641c73 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -28,6 +28,9 @@ int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
+xfs_failaddr_t xfs_dinode_verify_metadir(struct xfs_mount *mp,
+ struct xfs_dinode *dip, uint16_t mode, uint16_t flags,
+ uint64_t flags2);
xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp,
uint32_t extsize, uint16_t mode, uint16_t flags);
xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index cc38e1c3c3e1..deb0b7c00a1f 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -224,6 +224,8 @@ xfs_inode_inherit_flags2(
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+ if (xfs_is_metadir_inode(pip))
+ ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
/* Don't let invalid cowextsize hints propagate. */
failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
@@ -442,8 +444,8 @@ xfs_iunlink_update_bucket(
ASSERT(xfs_verify_agino_or_null(pag, new_agino));
old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
- old_value, new_agino);
+ trace_xfs_iunlink_update_bucket(pag, bucket_index, old_value,
+ new_agino);
/*
* We should never find the head of the list already set to the value
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 3e6682ed656b..15dec19b6c32 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -248,6 +248,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_XMI 0x1248 /* mapping exchange intent */
#define XFS_LI_XMD 0x1249 /* mapping exchange done */
+#define XFS_LI_EFI_RT 0x124a /* realtime extent free intent */
+#define XFS_LI_EFD_RT 0x124b /* realtime extent free done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -267,7 +269,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
{ XFS_LI_ATTRD, "XFS_LI_ATTRD" }, \
{ XFS_LI_XMI, "XFS_LI_XMI" }, \
- { XFS_LI_XMD, "XFS_LI_XMD" }
+ { XFS_LI_XMD, "XFS_LI_XMD" }, \
+ { XFS_LI_EFI_RT, "XFS_LI_EFI_RT" }, \
+ { XFS_LI_EFD_RT, "XFS_LI_EFD_RT" }
/*
* Inode Log Item Format definitions.
@@ -404,7 +408,7 @@ struct xfs_log_dinode {
uint16_t di_mode; /* mode and type of file */
int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
- uint8_t di_pad3[2]; /* unused in v2/3 inodes */
+ uint16_t di_metatype; /* metadata type, if DIFLAG2_METADATA */
uint32_t di_uid; /* owner's user id */
uint32_t di_gid; /* owner's group id */
uint32_t di_nlink; /* number of links to file */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 521d327e4c89..5397a8ff004d 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -77,6 +77,8 @@ extern const struct xlog_recover_item_ops xlog_attri_item_ops;
extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
extern const struct xlog_recover_item_ops xlog_xmi_item_ops;
extern const struct xlog_recover_item_ops xlog_xmd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefi_item_ops;
+extern const struct xlog_recover_item_ops xlog_rtefd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_metadir.c b/fs/xfs/libxfs/xfs_metadir.c
new file mode 100644
index 000000000000..bae7377c0f22
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metadir.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_parent.h"
+#include "xfs_health.h"
+
+/*
+ * Metadata Directory Tree
+ * =======================
+ *
+ * These functions provide an abstraction layer for looking up, creating, and
+ * deleting metadata inodes that live within a special metadata directory tree.
+ *
+ * This code does not manage the five existing metadata inodes: real time
+ * bitmap & summary; and the user, group, and quotas. All other metadata
+ * inodes must use only the xfs_meta{dir,file}_* functions.
+ *
+ * Callers wishing to create or hardlink a metadata inode must create an
+ * xfs_metadir_update structure, call the appropriate xfs_metadir* function,
+ * and then call xfs_metadir_commit or xfs_metadir_cancel to commit or cancel
+ * the update. Files in the metadata directory tree currently cannot be
+ * unlinked.
+ *
+ * When the metadir feature is enabled, all metadata inodes must have the
+ * "metadata" inode flag set to prevent them from being exposed to the outside
+ * world.
+ *
+ * Callers must take the ILOCK of any inode in the metadata directory tree to
+ * synchronize access to that inode. It is never necessary to take the IOLOCK
+ * or the MMAPLOCK since metadata inodes must not be exposed to user space.
+ */
+
+static inline void
+xfs_metadir_set_xname(
+ struct xfs_name *xname,
+ const char *path,
+ unsigned char ftype)
+{
+ xname->name = (const unsigned char *)path;
+ xname->len = strlen(path);
+ xname->type = ftype;
+}
+
+/*
+ * Given a parent directory @dp and a metadata inode path component @xname,
+ * look up the inode number in the directory, returning it in @ino.
+ * @xname.type must match the directory entry's ftype.
+ *
+ * Caller must hold ILOCK_EXCL.
+ */
+static inline int
+xfs_metadir_lookup(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_name *xname,
+ xfs_ino_t *ino)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args = {
+ .trans = tp,
+ .dp = dp,
+ .geo = mp->m_dir_geo,
+ .name = xname->name,
+ .namelen = xname->len,
+ .hashval = xfs_dir2_hashname(mp, xname),
+ .whichfork = XFS_DATA_FORK,
+ .op_flags = XFS_DA_OP_OKNOENT,
+ .owner = dp->i_ino,
+ };
+ int error;
+
+ if (!S_ISDIR(VFS_I(dp)->i_mode)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_dir_lookup_args(&args);
+ if (error)
+ return error;
+
+ if (!xfs_verify_ino(mp, args.inumber)) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+ if (xname->type != XFS_DIR3_FT_UNKNOWN && xname->type != args.filetype) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_metadir_lookup(dp, xname, args.inumber);
+ *ino = args.inumber;
+ return 0;
+}
+
+/*
+ * Look up and read a metadata inode from the metadata directory. If the path
+ * component doesn't exist, return -ENOENT.
+ */
+int
+xfs_metadir_load(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const char *path,
+ enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_name xname;
+ xfs_ino_t ino;
+ int error;
+
+ xfs_metadir_set_xname(&xname, path, XFS_DIR3_FT_UNKNOWN);
+
+ xfs_ilock(dp, XFS_ILOCK_EXCL);
+ error = xfs_metadir_lookup(tp, dp, &xname, &ino);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ return xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
+}
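
A matching read-side sketch (illustrative, not part of the patch; the "user" path and the quota metafile type echo xfs_dqinode_path() and xfs_dqinode_metafile_type() added later in this series, and @dp is an already-loaded parent directory):

	struct xfs_inode	*ip = NULL;
	int			error;

	/* Load the user quota inode from the metadir tree, if present. */
	error = xfs_metadir_load(tp, dp, "user", XFS_METAFILE_USRQUOTA, &ip);
	if (error)
		return error;	/* -ENOENT if the component was never created */
	/* ... use ip, then drop the reference with xfs_irele(ip) ... */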
+
+/*
+ * Unlock and release resources after committing (or cancelling) a metadata
+ * directory tree operation. The caller retains its reference to @upd->ip
+ * and must release it explicitly.
+ */
+static inline void
+xfs_metadir_teardown(
+ struct xfs_metadir_update *upd,
+ int error)
+{
+ trace_xfs_metadir_teardown(upd, error);
+
+ if (upd->ppargs) {
+ xfs_parent_finish(upd->dp->i_mount, upd->ppargs);
+ upd->ppargs = NULL;
+ }
+
+ if (upd->ip) {
+ if (upd->ip_locked)
+ xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+ upd->ip_locked = false;
+ }
+
+ if (upd->dp_locked)
+ xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+ upd->dp_locked = false;
+}
+
+/*
+ * Begin the process of creating a metadata file by allocating transactions
+ * and taking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_create(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_mount *mp = upd->dp->i_mount;
+ int error;
+
+ ASSERT(upd->dp != NULL);
+ ASSERT(upd->ip == NULL);
+ ASSERT(xfs_has_metadir(mp));
+ ASSERT(upd->metafile_type != XFS_METAFILE_UNKNOWN);
+
+ error = xfs_parent_start(mp, &upd->ppargs);
+ if (error)
+ return error;
+
+ /*
+ * If we ever need the ability to create rt metadata files on a
+ * pre-metadir filesystem, we'll need to dqattach the parent here.
+ * Currently we assume that mkfs will create the files and quotacheck
+ * will account for them.
+ */
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
+ xfs_create_space_res(mp, MAXNAMELEN), 0, 0, &upd->tp);
+ if (error)
+ goto out_teardown;
+
+ /*
+ * Lock the parent directory if there is one. We can't ijoin it to
+ * the transaction until after the child file has been created.
+ */
+ xfs_ilock(upd->dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ upd->dp_locked = true;
+
+ trace_xfs_metadir_start_create(upd);
+ return 0;
+out_teardown:
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/*
+ * Create a metadata inode with the given @mode, and insert it into the
+ * metadata directory tree at the given @upd->path. The path up to the final
+ * component must already exist. The final path component must not exist.
+ *
+ * The new metadata inode will be attached to the update structure @upd->ip,
+ * with the ILOCK held until the caller releases it.
+ *
+ * NOTE: This function may return a new inode to the caller even if it returns
+ * a negative error code. If an inode is passed back, the caller must finish
+ * setting up the inode before releasing it.
+ */
+int
+xfs_metadir_create(
+ struct xfs_metadir_update *upd,
+ umode_t mode)
+{
+ struct xfs_icreate_args args = {
+ .pip = upd->dp,
+ .mode = mode,
+ };
+ struct xfs_name xname;
+ struct xfs_dir_update du = {
+ .dp = upd->dp,
+ .name = &xname,
+ .ppargs = upd->ppargs,
+ };
+ struct xfs_mount *mp = upd->dp->i_mount;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ int error;
+
+ xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+
+ /* Check that the name does not already exist in the directory. */
+ xfs_metadir_set_xname(&xname, upd->path, XFS_DIR3_FT_UNKNOWN);
+ error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ fallthrough;
+ default:
+ return error;
+ }
+
+ /*
+ * A newly created regular or special file just has one directory
+ * entry pointing to it, but a directory also has the "." entry
+ * pointing to itself.
+ */
+ error = xfs_dialloc(&upd->tp, &args, &ino);
+ if (error)
+ return error;
+ error = xfs_icreate(upd->tp, ino, &args, &upd->ip);
+ if (error)
+ return error;
+ du.ip = upd->ip;
+ xfs_metafile_set_iflag(upd->tp, upd->ip, upd->metafile_type);
+ upd->ip_locked = true;
+
+ /*
+ * Join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dialloc rolls the transaction.
+ */
+ xfs_trans_ijoin(upd->tp, upd->dp, 0);
+
+ /* Create the entry. */
+ if (S_ISDIR(args.mode))
+ resblks = xfs_mkdir_space_res(mp, xname.len);
+ else
+ resblks = xfs_create_space_res(mp, xname.len);
+ xname.type = xfs_mode_to_ftype(args.mode);
+
+ trace_xfs_metadir_try_create(upd);
+
+ error = xfs_dir_create_child(upd->tp, resblks, &du);
+ if (error)
+ return error;
+
+ /* Metadir files are not accounted to quota. */
+
+ trace_xfs_metadir_create(upd);
+
+ return 0;
+}
+
+#ifndef __KERNEL__
+/*
+ * Begin the process of linking a metadata file by allocating transactions
+ * and locking whatever resources we're going to need.
+ */
+int
+xfs_metadir_start_link(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_mount *mp = upd->dp->i_mount;
+ unsigned int resblks;
+ int nospace_error = 0;
+ int error;
+
+ ASSERT(upd->dp != NULL);
+ ASSERT(upd->ip != NULL);
+ ASSERT(xfs_has_metadir(mp));
+
+ error = xfs_parent_start(mp, &upd->ppargs);
+ if (error)
+ return error;
+
+ resblks = xfs_link_space_res(mp, MAXNAMELEN);
+ error = xfs_trans_alloc_dir(upd->dp, &M_RES(mp)->tr_link, upd->ip,
+ &resblks, &upd->tp, &nospace_error);
+ if (error)
+ goto out_teardown;
+ if (!resblks) {
+ /* We don't allow reservationless updates. */
+ xfs_trans_cancel(upd->tp);
+ upd->tp = NULL;
+ xfs_iunlock(upd->dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(upd->ip, XFS_ILOCK_EXCL);
+ error = nospace_error;
+ goto out_teardown;
+ }
+
+ upd->dp_locked = true;
+ upd->ip_locked = true;
+
+ trace_xfs_metadir_start_link(upd);
+ return 0;
+out_teardown:
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/*
+ * Link the inode @upd->ip into the metadata directory tree at the path
+ * given by @upd->path.  The path (up to the final component) must already
+ * exist, but the final component must not already exist.
+ */
+int
+xfs_metadir_link(
+ struct xfs_metadir_update *upd)
+{
+ struct xfs_name xname;
+ struct xfs_dir_update du = {
+ .dp = upd->dp,
+ .name = &xname,
+ .ip = upd->ip,
+ .ppargs = upd->ppargs,
+ };
+ struct xfs_mount *mp = upd->dp->i_mount;
+ xfs_ino_t ino;
+ unsigned int resblks;
+ int error;
+
+ xfs_assert_ilocked(upd->dp, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(upd->ip, XFS_ILOCK_EXCL);
+
+ /* Look up the name in the current directory. */
+ xfs_metadir_set_xname(&xname, upd->path,
+ xfs_mode_to_ftype(VFS_I(upd->ip)->i_mode));
+ error = xfs_metadir_lookup(upd->tp, upd->dp, &xname, &ino);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ fallthrough;
+ default:
+ return error;
+ }
+
+ resblks = xfs_link_space_res(mp, xname.len);
+ error = xfs_dir_add_child(upd->tp, resblks, &du);
+ if (error)
+ return error;
+
+ trace_xfs_metadir_link(upd);
+
+ return 0;
+}
+#endif /* ! __KERNEL__ */
+
+/* Commit a metadir update and unlock/drop all resources. */
+int
+xfs_metadir_commit(
+ struct xfs_metadir_update *upd)
+{
+ int error;
+
+ trace_xfs_metadir_commit(upd);
+
+ error = xfs_trans_commit(upd->tp);
+ upd->tp = NULL;
+
+ xfs_metadir_teardown(upd, error);
+ return error;
+}
+
+/* Cancel a metadir update and unlock/drop all resources. */
+void
+xfs_metadir_cancel(
+ struct xfs_metadir_update *upd,
+ int error)
+{
+ trace_xfs_metadir_cancel(upd);
+
+ xfs_trans_cancel(upd->tp);
+ upd->tp = NULL;
+
+ xfs_metadir_teardown(upd, error);
+}
+
+/* Create a metadata directory for the last component of the path. */
+int
+xfs_metadir_mkdir(
+ struct xfs_inode *dp,
+ const char *path,
+ struct xfs_inode **ipp)
+{
+ struct xfs_metadir_update upd = {
+ .dp = dp,
+ .path = path,
+ .metafile_type = XFS_METAFILE_DIR,
+ };
+ int error;
+
+ if (xfs_is_shutdown(dp->i_mount))
+ return -EIO;
+
+ /* Allocate a transaction to create the last directory. */
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ return error;
+
+ /* Create the subdirectory and take our reference. */
+ error = xfs_metadir_create(&upd, S_IFDIR);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ goto out_irele;
+
+ xfs_finish_inode_setup(upd.ip);
+ *ipp = upd.ip;
+ return 0;
+
+out_cancel:
+ xfs_metadir_cancel(&upd, error);
+out_irele:
+ /* Have to finish setting up the inode to ensure it's deleted. */
+ if (upd.ip) {
+ xfs_finish_inode_setup(upd.ip);
+ xfs_irele(upd.ip);
+ }
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_metadir.h b/fs/xfs/libxfs/xfs_metadir.h
new file mode 100644
index 000000000000..bfecac7d3d14
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metadir.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METADIR_H__
+#define __XFS_METADIR_H__
+
+/* Cleanup widget for metadata inode creation and deletion. */
+struct xfs_metadir_update {
+ /* Parent directory */
+ struct xfs_inode *dp;
+
+ /* Path to metadata file */
+ const char *path;
+
+ /* Parent pointer update context */
+ struct xfs_parent_args *ppargs;
+
+ /* Child metadata file */
+ struct xfs_inode *ip;
+
+ struct xfs_trans *tp;
+
+ enum xfs_metafile_type metafile_type;
+
+ unsigned int dp_locked:1;
+ unsigned int ip_locked:1;
+};
+
+int xfs_metadir_load(struct xfs_trans *tp, struct xfs_inode *dp,
+ const char *path, enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp);
+
+int xfs_metadir_start_create(struct xfs_metadir_update *upd);
+int xfs_metadir_create(struct xfs_metadir_update *upd, umode_t mode);
+
+int xfs_metadir_start_link(struct xfs_metadir_update *upd);
+int xfs_metadir_link(struct xfs_metadir_update *upd);
+
+int xfs_metadir_commit(struct xfs_metadir_update *upd);
+void xfs_metadir_cancel(struct xfs_metadir_update *upd, int error);
+
+int xfs_metadir_mkdir(struct xfs_inode *dp, const char *path,
+ struct xfs_inode **ipp);
+
+#endif /* __XFS_METADIR_H__ */
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
new file mode 100644
index 000000000000..adeb25d1a444
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_metafile.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+
+/* Set up an inode to be recognized as a metadata inode. */
+void
+xfs_metafile_set_iflag(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ enum xfs_metafile_type metafile_type)
+{
+ VFS_I(ip)->i_mode &= ~0777;
+ VFS_I(ip)->i_uid = GLOBAL_ROOT_UID;
+ VFS_I(ip)->i_gid = GLOBAL_ROOT_GID;
+ if (S_ISDIR(VFS_I(ip)->i_mode))
+ ip->i_diflags |= XFS_METADIR_DIFLAGS;
+ else
+ ip->i_diflags |= XFS_METAFILE_DIFLAGS;
+ ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+ ip->i_diflags2 |= XFS_DIFLAG2_METADATA;
+ ip->i_metatype = metafile_type;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Clear the metadata inode flag. */
+void
+xfs_metafile_clear_iflag(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+
+ ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
new file mode 100644
index 000000000000..acec400123db
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_METAFILE_H__
+#define __XFS_METAFILE_H__
+
+/* All metadata files must have these flags set. */
+#define XFS_METAFILE_DIFLAGS (XFS_DIFLAG_IMMUTABLE | \
+ XFS_DIFLAG_SYNC | \
+ XFS_DIFLAG_NOATIME | \
+ XFS_DIFLAG_NODUMP | \
+ XFS_DIFLAG_NODEFRAG)
+
+/* All metadata directories must have these flags set. */
+#define XFS_METADIR_DIFLAGS (XFS_METAFILE_DIFLAGS | \
+ XFS_DIFLAG_NOSYMLINKS)
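
To make the contract concrete, a hypothetical predicate (a sketch, not part of the patch) that checks whether an inode carries the appropriate mask above plus the XFS_DIFLAG2_METADATA marker set by xfs_metafile_set_iflag():

static inline bool
example_metafile_flags_ok(
	struct xfs_inode	*ip)
{
	uint16_t		mask = S_ISDIR(VFS_I(ip)->i_mode) ?
					XFS_METADIR_DIFLAGS :
					XFS_METAFILE_DIFLAGS;

	/* Every bit in the relevant mask, plus the DIFLAG2 marker. */
	return (ip->i_diflags & mask) == mask &&
	       (ip->i_diflags2 & XFS_DIFLAG2_METADATA);
}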
+
+void xfs_metafile_set_iflag(struct xfs_trans *tp, struct xfs_inode *ip,
+ enum xfs_metafile_type metafile_type);
+void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
+
+/* Code specific to kernel/userspace; must be provided externally. */
+
+int xfs_trans_metafile_iget(struct xfs_trans *tp, xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type, struct xfs_inode **ipp);
+int xfs_metafile_iget(struct xfs_mount *mp, xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type, struct xfs_inode **ipp);
+
+#endif /* __XFS_METAFILE_H__ */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 23c133fd36f5..ad0dedf00f18 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -19,40 +19,46 @@
static_assert((value) == (expected), \
"XFS: value of " #value " is wrong, expected " #expected)
+#define XFS_CHECK_SB_OFFSET(field, offset) \
+ XFS_CHECK_OFFSET(struct xfs_dsb, field, offset); \
+ XFS_CHECK_OFFSET(struct xfs_sb, field, offset);
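
Each XFS_CHECK_SB_OFFSET() invocation expands to two offset assertions, one against the ondisk struct xfs_dsb and one against the incore struct xfs_sb; for example, XFS_CHECK_SB_OFFSET(sb_crc, 224) verifies that sb_crc sits at byte offset 224 in both structures, so the two layouts cannot silently drift apart.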
+
static inline void __init
xfs_check_ondisk_structs(void)
{
- /* ag/file structures */
+ /* file structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
- XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
+
+ /* space btrees */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_alloc_rec, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4);
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
- XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_bmdr_key_t, 8);
/* dir/attr trees */
XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
@@ -67,33 +73,34 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_hdr, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_map, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_local, 4);
/* realtime structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtsb, 56);
XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4);
XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_rtbuf_blkinfo, 48);
/*
- * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
- * 4 bytes anyway so it's not obviously a problem. Hence for the moment
- * we don't check this structure. This can be re-instated when the attr
- * definitions are updated to use c99 VLA definitions.
+ * m68k has problems with struct xfs_attr_leaf_name_remote, but we pad
+ * it to 4 bytes anyway so it's not obviously a problem. Hence for the
+ * moment we don't check this structure. This can be re-instated when
+ * the attr definitions are updated to use c99 VLA definitions.
*
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leaf_name_remote, 12);
*/
- XFS_CHECK_OFFSET(struct xfs_dsb, sb_crc, 224);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
- XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
- XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, valuelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, namelen, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_local, nameval, 3);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valueblk, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, valuelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, namelen, 8);
+ XFS_CHECK_OFFSET(struct xfs_attr_leaf_name_remote, name, 9);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_leafblock, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4);
XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0);
XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2);
@@ -101,27 +108,41 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1);
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2);
XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3);
- XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
- XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
- XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
- XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
- XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
- XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_blkinfo, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_intnode, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da_node_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16);
+ XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, freetag, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir2_data_unused, length, 2);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, namelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, offset, 1);
+ XFS_CHECK_OFFSET(struct xfs_dir2_sf_entry, name, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10);
XFS_CHECK_STRUCT_SIZE(struct xfs_parent_rec, 12);
+ /* ondisk dir/attr structures from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_entry, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_free, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_data_unused, 6);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_free_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_entry, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_hdr, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_leaf_tail, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_entry, 3);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir2_sf_hdr, 10);
+
/* log structures */
XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
@@ -157,6 +178,11 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(struct xfs_efi_log_format_32, efi_extents, 16);
XFS_CHECK_OFFSET(struct xfs_efi_log_format_64, efi_extents, 16);
+ /* ondisk log structures from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_unmount_log_format, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_xmd_log_format, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_xmi_log_format, 88);
+
/* parent pointer ioctls */
XFS_CHECK_STRUCT_SIZE(struct xfs_getparents_rec, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_getparents, 40);
@@ -201,6 +227,70 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
16299260424LL);
+
+ /* superblock field checks we got from xfs/122 */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
+ XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
+ XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
+ XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
+ XFS_CHECK_SB_OFFSET(sb_rblocks, 16);
+ XFS_CHECK_SB_OFFSET(sb_rextents, 24);
+ XFS_CHECK_SB_OFFSET(sb_uuid, 32);
+ XFS_CHECK_SB_OFFSET(sb_logstart, 48);
+ XFS_CHECK_SB_OFFSET(sb_rootino, 56);
+ XFS_CHECK_SB_OFFSET(sb_rbmino, 64);
+ XFS_CHECK_SB_OFFSET(sb_rsumino, 72);
+ XFS_CHECK_SB_OFFSET(sb_rextsize, 80);
+ XFS_CHECK_SB_OFFSET(sb_agblocks, 84);
+ XFS_CHECK_SB_OFFSET(sb_agcount, 88);
+ XFS_CHECK_SB_OFFSET(sb_rbmblocks, 92);
+ XFS_CHECK_SB_OFFSET(sb_logblocks, 96);
+ XFS_CHECK_SB_OFFSET(sb_versionnum, 100);
+ XFS_CHECK_SB_OFFSET(sb_sectsize, 102);
+ XFS_CHECK_SB_OFFSET(sb_inodesize, 104);
+ XFS_CHECK_SB_OFFSET(sb_inopblock, 106);
+ XFS_CHECK_SB_OFFSET(sb_blocklog, 120);
+ XFS_CHECK_SB_OFFSET(sb_fname[12], 120);
+ XFS_CHECK_SB_OFFSET(sb_sectlog, 121);
+ XFS_CHECK_SB_OFFSET(sb_inodelog, 122);
+ XFS_CHECK_SB_OFFSET(sb_inopblog, 123);
+ XFS_CHECK_SB_OFFSET(sb_agblklog, 124);
+ XFS_CHECK_SB_OFFSET(sb_rextslog, 125);
+ XFS_CHECK_SB_OFFSET(sb_inprogress, 126);
+ XFS_CHECK_SB_OFFSET(sb_imax_pct, 127);
+ XFS_CHECK_SB_OFFSET(sb_icount, 128);
+ XFS_CHECK_SB_OFFSET(sb_ifree, 136);
+ XFS_CHECK_SB_OFFSET(sb_fdblocks, 144);
+ XFS_CHECK_SB_OFFSET(sb_frextents, 152);
+ XFS_CHECK_SB_OFFSET(sb_uquotino, 160);
+ XFS_CHECK_SB_OFFSET(sb_gquotino, 168);
+ XFS_CHECK_SB_OFFSET(sb_qflags, 176);
+ XFS_CHECK_SB_OFFSET(sb_flags, 178);
+ XFS_CHECK_SB_OFFSET(sb_shared_vn, 179);
+ XFS_CHECK_SB_OFFSET(sb_inoalignmt, 180);
+ XFS_CHECK_SB_OFFSET(sb_unit, 184);
+ XFS_CHECK_SB_OFFSET(sb_width, 188);
+ XFS_CHECK_SB_OFFSET(sb_dirblklog, 192);
+ XFS_CHECK_SB_OFFSET(sb_logsectlog, 193);
+ XFS_CHECK_SB_OFFSET(sb_logsectsize, 194);
+ XFS_CHECK_SB_OFFSET(sb_logsunit, 196);
+ XFS_CHECK_SB_OFFSET(sb_features2, 200);
+ XFS_CHECK_SB_OFFSET(sb_bad_features2, 204);
+ XFS_CHECK_SB_OFFSET(sb_features_compat, 208);
+ XFS_CHECK_SB_OFFSET(sb_features_ro_compat, 212);
+ XFS_CHECK_SB_OFFSET(sb_features_incompat, 216);
+ XFS_CHECK_SB_OFFSET(sb_features_log_incompat, 220);
+ XFS_CHECK_SB_OFFSET(sb_crc, 224);
+ XFS_CHECK_SB_OFFSET(sb_spino_align, 228);
+ XFS_CHECK_SB_OFFSET(sb_pquotino, 232);
+ XFS_CHECK_SB_OFFSET(sb_lsn, 240);
+ XFS_CHECK_SB_OFFSET(sb_meta_uuid, 248);
+ XFS_CHECK_SB_OFFSET(sb_metadirino, 264);
+ XFS_CHECK_SB_OFFSET(sb_rgcount, 272);
+ XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
+ XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
+ XFS_CHECK_SB_OFFSET(sb_pad, 281);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index fb05f44f6c75..763d941a8420 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -143,4 +143,47 @@ time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
__be32 dtimer);
__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
+static inline const char *
+xfs_dqinode_path(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return "user";
+ case XFS_DQTYPE_GROUP:
+ return "group";
+ case XFS_DQTYPE_PROJ:
+ return "project";
+ }
+
+ ASSERT(0);
+ return NULL;
+}
+
+static inline enum xfs_metafile_type
+xfs_dqinode_metafile_type(xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return XFS_METAFILE_USRQUOTA;
+ case XFS_DQTYPE_GROUP:
+ return XFS_METAFILE_GRPQUOTA;
+ case XFS_DQTYPE_PROJ:
+ return XFS_METAFILE_PRJQUOTA;
+ }
+
+ ASSERT(0);
+ return XFS_METAFILE_UNKNOWN;
+}
+
+unsigned int xfs_dqinode_sick_mask(xfs_dqtype_t type);
+
+int xfs_dqinode_load(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dqtype_t type, struct xfs_inode **ipp);
+int xfs_dqinode_metadir_create(struct xfs_inode *dp, xfs_dqtype_t type,
+ struct xfs_inode **ipp);
+int xfs_dqinode_metadir_link(struct xfs_inode *dp, xfs_dqtype_t type,
+ struct xfs_inode *ip);
+int xfs_dqinode_mkdir_parent(struct xfs_mount *mp, struct xfs_inode **dpp);
+int xfs_dqinode_load_parent(struct xfs_trans *tp, struct xfs_inode **dpp);
+
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 198b84117df1..2dbab68b4fe6 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -154,7 +154,7 @@ xfs_refcount_complain_bad_rec(
xfs_warn(mp,
"Refcount BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"Start block 0x%x, block count 0x%x, references 0x%x",
irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
@@ -180,7 +180,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
+ fa = xfs_refcount_check_irec(to_perag(cur->bc_group), irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -1154,8 +1154,7 @@ xfs_refcount_adjust_extents(
goto out_error;
}
} else {
- fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.pag->pag_agno,
+ fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
tmp.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL,
@@ -1217,8 +1216,7 @@ xfs_refcount_adjust_extents(
}
goto advloop;
} else {
- fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_ag.pag->pag_agno,
+ fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group),
ext.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL,
@@ -1312,7 +1310,7 @@ xfs_refcount_continue_op(
xfs_agblock_t new_agbno)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
ri->ri_blockcount))) {
@@ -1320,10 +1318,10 @@ xfs_refcount_continue_op(
return -EFSCORRUPTED;
}
- ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ ri->ri_startblock = xfs_agbno_to_fsb(pag, new_agbno);
ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount));
- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
+ ASSERT(pag_agno(pag) == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
return 0;
}
@@ -1360,7 +1358,7 @@ xfs_refcount_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
nr_ops = rcur->bc_refc.nr_ops;
shape_changes = rcur->bc_refc.shape_changes;
xfs_btree_del_cursor(rcur, 0);
@@ -1368,13 +1366,14 @@ xfs_refcount_finish_one(
*pcur = NULL;
}
if (rcur == NULL) {
- error = xfs_alloc_read_agf(ri->ri_pag, tp,
+ struct xfs_perag *pag = to_perag(ri->ri_group);
+
+ error = xfs_alloc_read_agf(pag, tp,
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
return error;
- *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp,
- ri->ri_pag);
+ *pcur = rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag);
rcur->bc_refc.nr_ops = nr_ops;
rcur->bc_refc.shape_changes = shape_changes;
}
@@ -1880,7 +1879,8 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
+ if (xfs_refcount_check_irec(to_perag(cur->bc_group), &rr->rr_rrec) !=
+ NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
xfs_btree_mark_sick(cur);
@@ -1956,8 +1956,7 @@ xfs_refcount_recover_cow_leftovers(
goto out_free;
/* Free the orphan record */
- fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno,
- rr->rr_rrec.rc_startblock);
+ fsb = xfs_agbno_to_fsb(pag, rr->rr_rrec.rc_startblock);
xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
@@ -2029,7 +2028,7 @@ xfs_refcount_query_range_helper(
xfs_failaddr_t fa;
xfs_refcount_btrec_to_irec(rec, &irec);
- fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+ fa = xfs_refcount_check_irec(to_perag(cur->bc_group), &irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 68acb0b1b4a8..62d78afcf1f3 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -56,7 +56,7 @@ enum xfs_refcount_intent_type {
struct xfs_refcount_intent {
struct list_head ri_list;
- struct xfs_perag *ri_pag;
+ struct xfs_group *ri_group;
enum xfs_refcount_intent_type ri_type;
xfs_extlen_t ri_blockcount;
xfs_fsblock_t ri_startblock;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 795928d1a66d..54505fee1852 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -30,7 +30,7 @@ xfs_refcountbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag);
+ cur->bc_ag.agbp, to_perag(cur->bc_group));
}
STATIC void
@@ -68,21 +68,20 @@ xfs_refcountbt_alloc_block(
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.pag = cur->bc_ag.pag;
+ args.pag = to_perag(cur->bc_group);
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
error = xfs_alloc_vextent_near_bno(&args,
- XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno,
- xfs_refc_block(args.mp)));
+ xfs_agbno_to_fsb(args.pag, xfs_refc_block(args.mp)));
if (error)
goto out_error;
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_ag.pag->pag_agno);
+ ASSERT(args.agno == cur->bc_group->xg_gno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -170,7 +169,7 @@ xfs_refcountbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
@@ -362,11 +361,11 @@ xfs_refcountbt_init_cursor(
{
struct xfs_btree_cur *cur;
- ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
+ ASSERT(pag_agno(pag) < mp->m_sb.sb_agcount);
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_refc.nr_ops = 0;
cur->bc_refc.shape_changes = 0;
cur->bc_ag.agbp = agbp;
@@ -515,7 +514,7 @@ xfs_refcountbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
*ask += xfs_refcountbt_max_size(mp, agblocks);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 6ef4687b3aba..d0df68dc3131 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -213,7 +213,7 @@ xfs_rmap_check_irec(
struct xfs_perag *pag,
const struct xfs_rmap_irec *irec)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
bool is_inode;
bool is_unwritten;
bool is_bmbt;
@@ -269,9 +269,7 @@ xfs_rmap_check_btrec(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *irec)
{
- if (xfs_btree_is_mem_rmap(cur->bc_ops))
- return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
- return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+ return xfs_rmap_check_irec(to_perag(cur->bc_group), irec);
}
static inline int
@@ -288,7 +286,7 @@ xfs_rmap_complain_bad_rec(
else
xfs_warn(mp,
"Reverse Mapping BTree record corruption in AG %d detected at %pS!",
- cur->bc_ag.pag->pag_agno, fa);
+ cur->bc_group->xg_gno, fa);
xfs_warn(mp,
"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
irec->rm_owner, irec->rm_flags, irec->rm_startblock,
@@ -835,7 +833,7 @@ xfs_rmap_hook_enable(void)
static inline void
xfs_rmap_update_hook(
struct xfs_trans *tp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
enum xfs_rmap_intent_type op,
xfs_agblock_t startblock,
xfs_extlen_t blockcount,
@@ -850,27 +848,27 @@ xfs_rmap_update_hook(
.oinfo = *oinfo, /* struct copy */
};
- if (pag)
- xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+ if (xg)
+ xfs_hooks_call(&xg->xg_rmap_update_hooks, op, &p);
}
}
/* Call the specified function during a reverse mapping update. */
int
xfs_rmap_hook_add(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
struct xfs_rmap_hook *hook)
{
- return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+ return xfs_hooks_add(&xg->xg_rmap_update_hooks, &hook->rmap_hook);
}
/* Stop calling the specified function during a reverse mapping update. */
void
xfs_rmap_hook_del(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
struct xfs_rmap_hook *hook)
{
- xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+ xfs_hooks_del(&xg->xg_rmap_update_hooks, &hook->rmap_hook);
}
/* Configure rmap update hook functions. */
@@ -905,7 +903,8 @@ xfs_rmap_free(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
- xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
+ xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_UNMAP, bno, len,
+ false, oinfo);
error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -1149,7 +1148,8 @@ xfs_rmap_alloc(
return 0;
cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
- xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
+ xfs_rmap_update_hook(tp, pag_group(pag), XFS_RMAP_MAP, bno, len, false,
+ oinfo);
error = xfs_rmap_map(cur, bno, len, false, oinfo);
xfs_btree_del_cursor(cur, error);
@@ -2586,28 +2586,30 @@ xfs_rmap_finish_one(
* If we haven't gotten a cursor or the cursor AG doesn't match
* the startblock, get one now.
*/
- if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
+ if (rcur != NULL && rcur->bc_group != ri->ri_group) {
xfs_btree_del_cursor(rcur, 0);
rcur = NULL;
*pcur = NULL;
}
if (rcur == NULL) {
+ struct xfs_perag *pag = to_perag(ri->ri_group);
+
/*
* Refresh the freelist before we start changing the
* rmapbt, because a shape change could cause us to
* allocate blocks.
*/
- error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
+ error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
if (error) {
- xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
return error;
}
if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
- xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
return -EFSCORRUPTED;
}
- *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
+ *pcur = rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
}
xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
@@ -2620,7 +2622,7 @@ xfs_rmap_finish_one(
if (error)
return error;
- xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+ xfs_rmap_update_hook(tp, ri->ri_group, ri->ri_type, bno,
ri->ri_bmap.br_blockcount, unwritten, &oinfo);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b783dd4dd95d..96b4321d8310 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -173,7 +173,7 @@ struct xfs_rmap_intent {
int ri_whichfork;
uint64_t ri_owner;
struct xfs_bmbt_irec ri_bmap;
- struct xfs_perag *ri_pag;
+ struct xfs_group *ri_group;
};
/* functions for updating the rmapbt based on bmbt map/unmap operations */
@@ -264,8 +264,8 @@ struct xfs_rmap_hook {
void xfs_rmap_hook_disable(void);
void xfs_rmap_hook_enable(void);
-int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
-void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+int xfs_rmap_hook_add(struct xfs_group *xg, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_group *xg, struct xfs_rmap_hook *hook);
void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
#endif
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index ac2f1f499b76..2cab694ac58a 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -57,7 +57,7 @@ xfs_rmapbt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_ag.agbp, cur->bc_ag.pag);
+ cur->bc_ag.agbp, to_perag(cur->bc_group));
}
STATIC void
@@ -66,14 +66,15 @@ xfs_rmapbt_set_root(
const union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_ag.agbp;
- struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
ASSERT(ptr->s != 0);
agf->agf_rmap_root = ptr->s;
be32_add_cpu(&agf->agf_rmap_level, inc);
- cur->bc_ag.pag->pagf_rmap_level += inc;
+ pag->pagf_rmap_level += inc;
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -87,7 +88,7 @@ xfs_rmapbt_alloc_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
struct xfs_alloc_arg args = { .len = 1 };
int error;
xfs_agblock_t bno;
@@ -102,7 +103,7 @@ xfs_rmapbt_alloc_block(
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
+ xfs_extent_busy_reuse(pag_group(pag), bno, 1, false);
new->s = cpu_to_be32(bno);
be32_add_cpu(&agf->agf_rmap_blocks, 1);
@@ -125,7 +126,7 @@ xfs_rmapbt_free_block(
{
struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr;
- struct xfs_perag *pag = cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(cur->bc_group);
xfs_agblock_t bno;
int error;
@@ -136,7 +137,7 @@ xfs_rmapbt_free_block(
if (error)
return error;
- xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
+ xfs_extent_busy_insert(cur->bc_tp, pag_group(pag), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
@@ -227,7 +228,7 @@ xfs_rmapbt_init_ptr_from_cur(
{
struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_group->xg_gno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_rmap_root;
}
@@ -538,7 +539,7 @@ xfs_rmapbt_init_cursor(
cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
- cur->bc_ag.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
cur->bc_ag.agbp = agbp;
if (agbp) {
struct xfs_agf *agf = agbp->b_addr;
@@ -647,14 +648,13 @@ xfs_rmapbt_mem_cursor(
struct xfbtree *xfbt)
{
struct xfs_btree_cur *cur;
- struct xfs_mount *mp = pag->pag_mount;
- cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+ cur = xfs_btree_alloc_cursor(pag_mount(pag), tp, &xfs_rmapbt_mem_ops,
xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
cur->bc_mem.xfbtree = xfbt;
cur->bc_nlevels = xfbt->nlevels;
- cur->bc_mem.pag = xfs_perag_hold(pag);
+ cur->bc_group = xfs_group_hold(pag_group(pag));
return cur;
}
@@ -863,7 +863,7 @@ xfs_rmapbt_calc_reserves(
* never be available for the kinds of things that would require btree
* expansion. We therefore can pretend the space isn't there.
*/
- if (xfs_ag_contains_log(mp, pag->pag_agno))
+ if (xfs_ag_contains_log(mp, pag_agno(pag)))
agblocks -= mp->m_sb.sb_logblocks;
/* Reserve 1% of the AG or enough for 1 block per record. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 27a4472402ba..4ddfb7e395b3 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -20,28 +20,87 @@
#include "xfs_error.h"
#include "xfs_rtbitmap.h"
#include "xfs_health.h"
+#include "xfs_sb.h"
+#include "xfs_errortag.h"
+#include "xfs_log.h"
+#include "xfs_buf_item.h"
+#include "xfs_extent_busy.h"
/*
* Realtime allocator bitmap functions shared with userspace.
*/
-/*
- * Real time buffers need verifiers to avoid runtime warnings during IO.
- * We don't have anything to verify, however, so these are just dummy
- * operations.
- */
+static xfs_failaddr_t
+xfs_rtbuf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, hdr->rt_magic))
+ return __this_address;
+ if (!xfs_has_rtgroups(mp))
+ return __this_address;
+ if (!xfs_has_crc(mp))
+ return __this_address;
+ if (!uuid_equal(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (hdr->rt_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
+ return __this_address;
+ return NULL;
+}
+
static void
xfs_rtbuf_verify_read(
- struct xfs_buf *bp)
+ struct xfs_buf *bp)
{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+ xfs_failaddr_t fa;
+
+ if (!xfs_has_rtgroups(mp))
+ return;
+
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->rt_lsn))) {
+ fa = __this_address;
+ goto fail;
+ }
+
+ if (!xfs_buf_verify_cksum(bp, XFS_RTBUF_CRC_OFF)) {
+ fa = __this_address;
+ goto fail;
+ }
+
+ fa = xfs_rtbuf_verify(bp);
+ if (fa)
+ goto fail;
+
return;
+fail:
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
static void
xfs_rtbuf_verify_write(
struct xfs_buf *bp)
{
- return;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+ xfs_failaddr_t fa;
+
+ if (!xfs_has_rtgroups(mp))
+ return;
+
+ fa = xfs_rtbuf_verify(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ if (bip)
+ hdr->rt_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ xfs_buf_update_cksum(bp, XFS_RTBUF_CRC_OFF);
}
const struct xfs_buf_ops xfs_rtbuf_ops = {
@@ -50,6 +109,22 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
.verify_write = xfs_rtbuf_verify_write,
};
+const struct xfs_buf_ops xfs_rtbitmap_buf_ops = {
+ .name = "xfs_rtbitmap",
+ .magic = { 0, cpu_to_be32(XFS_RTBITMAP_MAGIC) },
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+ .verify_struct = xfs_rtbuf_verify,
+};
+
+const struct xfs_buf_ops xfs_rtsummary_buf_ops = {
+ .name = "xfs_rtsummary",
+ .magic = { 0, cpu_to_be32(XFS_RTSUMMARY_MAGIC) },
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+ .verify_struct = xfs_rtbuf_verify,
+};
+
/* Release cached rt bitmap and summary buffers. */
void
xfs_rtbuf_cache_relse(
@@ -75,28 +150,31 @@ static int
xfs_rtbuf_get(
struct xfs_rtalloc_args *args,
xfs_fileoff_t block, /* block number in bitmap or summary */
- int issum) /* is summary not bitmap */
+ enum xfs_rtg_inodes type)
{
+ struct xfs_inode *ip = args->rtg->rtg_inodes[type];
struct xfs_mount *mp = args->mp;
struct xfs_buf **cbpp; /* cached block buffer */
xfs_fileoff_t *coffp; /* cached block number */
struct xfs_buf *bp; /* block buffer, result */
- struct xfs_inode *ip; /* bitmap or summary inode */
struct xfs_bmbt_irec map;
- enum xfs_blft type;
+ enum xfs_blft buf_type;
int nmap = 1;
int error;
- if (issum) {
+ switch (type) {
+ case XFS_RTGI_SUMMARY:
cbpp = &args->sumbp;
coffp = &args->sumoff;
- ip = mp->m_rsumip;
- type = XFS_BLFT_RTSUMMARY_BUF;
- } else {
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ break;
+ case XFS_RTGI_BITMAP:
cbpp = &args->rbmbp;
coffp = &args->rbmoff;
- ip = mp->m_rbmip;
- type = XFS_BLFT_RTBITMAP_BUF;
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+ break;
+ default:
+ return -EINVAL;
}
/*
@@ -119,22 +197,32 @@ xfs_rtbuf_get(
return error;
if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
- xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
- XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, type);
return -EFSCORRUPTED;
}
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+ mp->m_bsize, 0, &bp,
+ xfs_rtblock_ops(mp, type));
if (xfs_metadata_is_sick(error))
- xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
- XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, type);
if (error)
return error;
- xfs_trans_buf_set_type(args->tp, bp, type);
+ if (xfs_has_rtgroups(mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (hdr->rt_owner != cpu_to_be64(ip->i_ino)) {
+ xfs_buf_mark_corrupt(bp);
+ xfs_trans_brelse(args->tp, bp);
+ xfs_rtginode_mark_sick(args->rtg, type);
+ return -EFSCORRUPTED;
+ }
+ }
+
+ xfs_trans_buf_set_type(args->tp, bp, buf_type);
*cbpp = bp;
*coffp = block;
return 0;
@@ -148,11 +236,11 @@ xfs_rtbitmap_read_buf(
struct xfs_mount *mp = args->mp;
if (XFS_IS_CORRUPT(mp, block >= mp->m_sb.sb_rbmblocks)) {
- xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
+ xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_BITMAP);
return -EFSCORRUPTED;
}
- return xfs_rtbuf_get(args, block, 0);
+ return xfs_rtbuf_get(args, block, XFS_RTGI_BITMAP);
}
int
@@ -163,10 +251,10 @@ xfs_rtsummary_read_buf(
struct xfs_mount *mp = args->mp;
if (XFS_IS_CORRUPT(mp, block >= mp->m_rsumblocks)) {
- xfs_rt_mark_sick(args->mp, XFS_SICK_RT_SUMMARY);
+ xfs_rtginode_mark_sick(args->rtg, XFS_RTGI_SUMMARY);
return -EFSCORRUPTED;
}
- return xfs_rtbuf_get(args, block, 1);
+ return xfs_rtbuf_get(args, block, XFS_RTGI_SUMMARY);
}
/*
@@ -503,6 +591,7 @@ xfs_rtmodify_summary(
{
struct xfs_mount *mp = args->mp;
xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ uint8_t *rsum_cache = args->rtg->rtg_rsum_cache;
unsigned int infoword;
xfs_suminfo_t val;
int error;
@@ -514,11 +603,11 @@ xfs_rtmodify_summary(
infoword = xfs_rtsumoffs_to_infoword(mp, so);
val = xfs_suminfo_add(args, infoword, delta);
- if (mp->m_rsum_cache) {
- if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
- if (val != 0 && log >= mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log + 1;
+ if (rsum_cache) {
+ if (val == 0 && log + 1 == rsum_cache[bbno])
+ rsum_cache[bbno] = log;
+ if (val != 0 && log >= rsum_cache[bbno])
+ rsum_cache[bbno] = log + 1;
}
xfs_trans_log_rtsummary(args, infoword);
@@ -737,7 +826,7 @@ xfs_rtfree_range(
/*
* Find the next allocated block (end of allocated extent).
*/
- error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1,
&postblock);
if (error)
return error;
@@ -961,19 +1050,25 @@ xfs_rtcheck_alloc_range(
int
xfs_rtfree_extent(
struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_rtgroup *rtg,
xfs_rtxnum_t start, /* starting rtext number to free */
xfs_rtxlen_t len) /* length of extent freed */
{
struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
struct xfs_rtalloc_args args = {
.mp = mp,
.tp = tp,
+ .rtg = rtg,
};
int error;
struct timespec64 atime;
- ASSERT(mp->m_rbmip->i_itemp != NULL);
- xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
+ ASSERT(rbmip->i_itemp != NULL);
+ xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
+
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+ return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);
if (error)
@@ -990,19 +1085,21 @@ xfs_rtfree_extent(
* Mark more blocks free in the superblock.
*/
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+
/*
* If we've now freed all the blocks, reset the file sequence
- * number to 0.
+ * number to 0 for pre-RTG file systems.
*/
- if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ if (!xfs_has_rtgroups(mp) &&
+ tp->t_frextents_delta + mp->m_sb.sb_frextents ==
mp->m_sb.sb_rextents) {
- if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
- mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM))
+ rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
- atime = inode_get_atime(VFS_I(mp->m_rbmip));
+ atime = inode_get_atime(VFS_I(rbmip));
atime.tv_sec = 0;
- inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime);
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ inode_set_atime_to_ts(VFS_I(rbmip), atime);
+ xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
}
error = 0;
out:
@@ -1018,15 +1115,17 @@ out:
int
xfs_rtfree_blocks(
struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
xfs_fsblock_t rtbno,
xfs_filblks_t rtlen)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_extlen_t mod;
+ int error;
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
- mod = xfs_rtb_to_rtxoff(mp, rtlen);
+ mod = xfs_blen_to_rtxoff(mp, rtlen);
if (mod) {
ASSERT(mod == 0);
return -EIO;
@@ -1038,21 +1137,31 @@ xfs_rtfree_blocks(
return -EIO;
}
- return xfs_rtfree_extent(tp, xfs_rtb_to_rtx(mp, rtbno),
- xfs_rtb_to_rtx(mp, rtlen));
+ error = xfs_rtfree_extent(tp, rtg, xfs_rtb_to_rtx(mp, rtbno),
+ xfs_extlen_to_rtxlen(mp, rtlen));
+ if (error)
+ return error;
+
+ if (xfs_has_rtgroups(mp))
+ xfs_extent_busy_insert(tp, rtg_group(rtg),
+ xfs_rtb_to_rgbno(mp, rtbno), rtlen, 0);
+
+ return 0;
}
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtxnum_t start,
xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_rtalloc_args args = {
+ .rtg = rtg,
.mp = mp,
.tp = tp,
};
@@ -1060,10 +1169,10 @@ xfs_rtalloc_query_range(
if (start > end)
return -EINVAL;
- if (start == end || start >= mp->m_sb.sb_rextents)
+ if (start == end || start >= rtg->rtg_extents)
return 0;
- end = min(end, mp->m_sb.sb_rextents - 1);
+ end = min(end, rtg->rtg_extents - 1);
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
@@ -1086,7 +1195,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = start;
rec.ar_extcount = rtend - start + 1;
- error = fn(mp, tp, &rec, priv);
+ error = fn(rtg, tp, &rec, priv);
if (error)
break;
}
@@ -1101,26 +1210,27 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
{
- return xfs_rtalloc_query_range(mp, tp, 0, mp->m_sb.sb_rextents - 1, fn,
+ return xfs_rtalloc_query_range(rtg, tp, 0, rtg->rtg_extents - 1, fn,
priv);
}
/* Is the given extent all free? */
int
xfs_rtalloc_extent_is_free(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
xfs_rtxnum_t start,
xfs_rtxlen_t len,
bool *is_free)
{
struct xfs_rtalloc_args args = {
- .mp = mp,
+ .mp = rtg_mount(rtg),
+ .rtg = rtg,
.tp = tp,
};
xfs_rtxnum_t end;
@@ -1136,88 +1246,71 @@ xfs_rtalloc_extent_is_free(
return 0;
}
+/* Compute the number of rt extents tracked by a single bitmap block. */
+xfs_rtxnum_t
+xfs_rtbitmap_rtx_per_rbmblock(
+ struct xfs_mount *mp)
+{
+ unsigned int rbmblock_bytes = mp->m_sb.sb_blocksize;
+
+ if (xfs_has_rtgroups(mp))
+ rbmblock_bytes -= sizeof(struct xfs_rtbuf_blkinfo);
+
+ return rbmblock_bytes * NBBY;
+}
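
As a worked example (using the 48-byte struct xfs_rtbuf_blkinfo size asserted in xfs_ondisk.h): on a 4096-byte-block filesystem with rtgroups enabled, each bitmap block tracks (4096 - 48) * 8 = 32384 rt extents, versus 4096 * 8 = 32768 on a pre-rtgroups filesystem.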
+
/*
* Compute the number of rtbitmap blocks needed to track the given number of rt
* extents.
*/
xfs_filblks_t
-xfs_rtbitmap_blockcount(
+xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
- return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize);
+ return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
-/* Compute the number of rtsummary blocks needed to track the given rt space. */
-xfs_filblks_t
-xfs_rtsummary_blockcount(
- struct xfs_mount *mp,
- unsigned int rsumlevels,
- xfs_extlen_t rbmblocks)
+/* How many rt extents does each rtbitmap file track? */
+static inline xfs_rtbxlen_t
+xfs_rtbitmap_bitcount(
+ struct xfs_mount *mp)
{
- unsigned long long rsumwords;
+ if (!mp->m_sb.sb_rextents)
+ return 0;
- rsumwords = (unsigned long long)rsumlevels * rbmblocks;
- return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG);
-}
+ /* rtgroup size can be nonzero even if rextents is zero */
+ if (xfs_has_rtgroups(mp))
+ return mp->m_sb.sb_rgextents;
-/* Lock both realtime free space metadata inodes for a freespace update. */
-void
-xfs_rtbitmap_lock(
- struct xfs_mount *mp)
-{
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ return mp->m_sb.sb_rextents;
}
/*
- * Join both realtime free space metadata inodes to the transaction. The
- * ILOCKs will be released on transaction commit.
+ * Compute the number of rtbitmap blocks used for a given file system.
*/
-void
-xfs_rtbitmap_trans_join(
- struct xfs_trans *tp)
-{
- xfs_trans_ijoin(tp, tp->t_mountp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tp->t_mountp->m_rsumip, XFS_ILOCK_EXCL);
-}
-
-/* Unlock both realtime free space metadata inodes after a freespace update. */
-void
-xfs_rtbitmap_unlock(
+xfs_filblks_t
+xfs_rtbitmap_blockcount(
struct xfs_mount *mp)
{
- xfs_iunlock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+ return xfs_rtbitmap_blockcount_len(mp, xfs_rtbitmap_bitcount(mp));
}
/*
- * Lock the realtime free space metadata inodes for a freespace scan. Callers
- * must walk metadata blocks in order of increasing file offset.
+ * Compute the geometry of the rtsummary file needed to track the given rt
+ * space.
*/
-void
-xfs_rtbitmap_lock_shared(
- struct xfs_mount *mp,
- unsigned int rbmlock_flags)
-{
- if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
-
- if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
-}
-
-/* Unlock the realtime free space metadata inodes after a freespace scan. */
-void
-xfs_rtbitmap_unlock_shared(
+xfs_filblks_t
+xfs_rtsummary_blockcount(
struct xfs_mount *mp,
- unsigned int rbmlock_flags)
+ unsigned int *rsumlevels)
{
- if (rbmlock_flags & XFS_RBMLOCK_SUMMARY)
- xfs_iunlock(mp->m_rsumip, XFS_ILOCK_SHARED | XFS_ILOCK_RTSUM);
+ xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
+ unsigned long long rsumwords;
- if (rbmlock_flags & XFS_RBMLOCK_BITMAP)
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ *rsumlevels = xfs_compute_rextslog(rextents) + 1;
+ rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
+ return howmany_64(rsumwords, mp->m_blockwsize);
}
static int
@@ -1260,21 +1353,26 @@ out_trans_cancel:
/* Get a buffer for the block. */
static int
xfs_rtfile_initialize_block(
- struct xfs_inode *ip,
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
xfs_fsblock_t fsbno,
void *data)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_inode *ip = rtg->rtg_inodes[type];
struct xfs_trans *tp;
struct xfs_buf *bp;
+ void *bufdata;
const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
enum xfs_blft buf_type;
int error;
- if (ip == mp->m_rsumip)
+ if (type == XFS_RTGI_BITMAP)
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+ else if (type == XFS_RTGI_SUMMARY)
buf_type = XFS_BLFT_RTSUMMARY_BUF;
else
- buf_type = XFS_BLFT_RTBITMAP_BUF;
+ return -EINVAL;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, 0, 0, 0, &tp);
if (error)
@@ -1288,13 +1386,30 @@ xfs_rtfile_initialize_block(
xfs_trans_cancel(tp);
return error;
}
+ bufdata = bp->b_addr;
xfs_trans_buf_set_type(tp, bp, buf_type);
- bp->b_ops = &xfs_rtbuf_ops;
+ bp->b_ops = xfs_rtblock_ops(mp, type);
+
+ if (xfs_has_rtgroups(mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ if (type == XFS_RTGI_BITMAP)
+ hdr->rt_magic = cpu_to_be32(XFS_RTBITMAP_MAGIC);
+ else
+ hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+ hdr->rt_owner = cpu_to_be64(ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(XFS_FSB_TO_DADDR(mp, fsbno));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &mp->m_sb.sb_meta_uuid);
+
+ bufdata += sizeof(*hdr);
+ }
+
if (data)
- memcpy(bp->b_addr, data, copylen);
+ memcpy(bufdata, data, copylen);
else
- memset(bp->b_addr, 0, copylen);
+ memset(bufdata, 0, copylen);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
return xfs_trans_commit(tp);
}
@@ -1306,12 +1421,13 @@ xfs_rtfile_initialize_block(
*/
int
xfs_rtfile_initialize_blocks(
- struct xfs_inode *ip, /* inode (bitmap/summary) */
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
xfs_fileoff_t offset_fsb, /* offset to start from */
xfs_fileoff_t end_fsb, /* offset to allocate to */
void *data) /* data to fill the blocks */
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = rtg_mount(rtg);
const size_t copylen = mp->m_blockwsize << XFS_WORDLOG;
while (offset_fsb < end_fsb) {
@@ -1319,8 +1435,8 @@ xfs_rtfile_initialize_blocks(
xfs_filblks_t i;
int error;
- error = xfs_rtfile_alloc_blocks(ip, offset_fsb,
- end_fsb - offset_fsb, &map);
+ error = xfs_rtfile_alloc_blocks(rtg->rtg_inodes[type],
+ offset_fsb, end_fsb - offset_fsb, &map);
if (error)
return error;
@@ -1330,7 +1446,7 @@ xfs_rtfile_initialize_blocks(
* Do this one block per transaction, to keep it simple.
*/
for (i = 0; i < map.br_blockcount; i++) {
- error = xfs_rtfile_initialize_block(ip,
+ error = xfs_rtfile_initialize_block(rtg, type,
map.br_startblock + i, data);
if (error)
return error;
@@ -1343,3 +1459,35 @@ xfs_rtfile_initialize_blocks(
return 0;
}
+
+int
+xfs_rtbitmap_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ ip->i_disk_size = mp->m_sb.sb_rbmblocks * mp->m_sb.sb_blocksize;
+ if (init && !xfs_has_rtgroups(mp)) {
+ ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ inode_set_atime(VFS_I(ip), 0, 0);
+ }
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+int
+xfs_rtsummary_create(
+ struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ ip->i_disk_size = mp->m_rsumblocks * mp->m_sb.sb_blocksize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index 140513d1d6bc..16563a44bd13 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -6,7 +6,10 @@
#ifndef __XFS_RTBITMAP_H__
#define __XFS_RTBITMAP_H__
+#include "xfs_rtgroup.h"
+
struct xfs_rtalloc_args {
+ struct xfs_rtgroup *rtg;
struct xfs_mount *mp;
struct xfs_trans *tp;
@@ -19,13 +22,37 @@ struct xfs_rtalloc_args {
static inline xfs_rtblock_t
xfs_rtx_to_rtb(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
xfs_rtxnum_t rtx)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg));
+
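+ /*
+ * rtx is relative to this group, so add the group's first block to
+ * form a global rt block number; the shift is a fast path for
+ * power-of-two rt extent sizes.
+ */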
+ if (mp->m_rtxblklog >= 0)
+ return start + (rtx << mp->m_rtxblklog);
+ return start + (rtx * mp->m_sb.sb_rextsize);
+}
+
+/* Convert an rgbno into an rt extent number. */
+static inline xfs_rtxnum_t
+xfs_rgbno_to_rtx(
+ struct xfs_mount *mp,
+ xfs_rgblock_t rgbno)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return rgbno >> mp->m_rtxblklog;
+ return rgbno / mp->m_sb.sb_rextsize;
+}
+
+static inline uint64_t
+xfs_rtbxlen_to_blen(
+ struct xfs_mount *mp,
+ xfs_rtbxlen_t rtbxlen)
+{
if (mp->m_rtxblklog >= 0)
- return rtx << mp->m_rtxblklog;
+ return rtbxlen << mp->m_rtxblklog;
- return rtx * mp->m_sb.sb_rextsize;
+ return rtbxlen * mp->m_sb.sb_rextsize;
}
static inline xfs_extlen_t
@@ -62,15 +89,49 @@ xfs_extlen_to_rtxlen(
return len / mp->m_sb.sb_rextsize;
}
+/* Convert an rt block count into an rt extent count. */
+static inline xfs_rtbxlen_t
+xfs_blen_to_rtbxlen(
+ struct xfs_mount *mp,
+ uint64_t blen)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return blen >> mp->m_rtxblklog;
+
+ return div_u64(blen, mp->m_sb.sb_rextsize);
+}
+
+/* Return the offset of a file block length within an rt extent. */
+static inline xfs_extlen_t
+xfs_blen_to_rtxoff(
+ struct xfs_mount *mp,
+ xfs_filblks_t blen)
+{
+ if (likely(mp->m_rtxblklog >= 0))
+ return blen & mp->m_rtxblkmask;
+
+ return do_div(blen, mp->m_sb.sb_rextsize);
+}
+
+/* Round this block count up to the nearest rt extent size. */
+static inline xfs_filblks_t
+xfs_blen_roundup_rtx(
+ struct xfs_mount *mp,
+ xfs_filblks_t blen)
+{
+ return roundup_64(blen, mp->m_sb.sb_rextsize);
+}
+
/* Convert an rt block number into an rt extent number. */
static inline xfs_rtxnum_t
xfs_rtb_to_rtx(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ /* open-coded 64-bit masking operation */
+ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask;
if (likely(mp->m_rtxblklog >= 0))
return rtbno >> mp->m_rtxblklog;
-
return div_u64(rtbno, mp->m_sb.sb_rextsize);
}
@@ -80,48 +141,29 @@ xfs_rtb_to_rtxoff(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ /* open-coded 64-bit masking operation */
+ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask;
if (likely(mp->m_rtxblklog >= 0))
return rtbno & mp->m_rtxblkmask;
-
return do_div(rtbno, mp->m_sb.sb_rextsize);
}
-/*
- * Convert an rt block number into an rt extent number, rounding up to the next
- * rt extent if the rt block is not aligned to an rt extent boundary.
- */
-static inline xfs_rtxnum_t
-xfs_rtb_to_rtxup(
- struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
-{
- if (likely(mp->m_rtxblklog >= 0)) {
- if (rtbno & mp->m_rtxblkmask)
- return (rtbno >> mp->m_rtxblklog) + 1;
- return rtbno >> mp->m_rtxblklog;
- }
-
- if (do_div(rtbno, mp->m_sb.sb_rextsize))
- rtbno++;
- return rtbno;
-}
-
-/* Round this rtblock up to the nearest rt extent size. */
+/* Round this file block offset up to the nearest rt extent size. */
static inline xfs_rtblock_t
-xfs_rtb_roundup_rtx(
+xfs_fileoff_roundup_rtx(
struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
+ xfs_fileoff_t off)
{
- return roundup_64(rtbno, mp->m_sb.sb_rextsize);
+ return roundup_64(off, mp->m_sb.sb_rextsize);
}
-/* Round this rtblock down to the nearest rt extent size. */
+/* Round this file block offset down to the nearest rt extent size. */
static inline xfs_rtblock_t
-xfs_rtb_rounddown_rtx(
+xfs_fileoff_rounddown_rtx(
struct xfs_mount *mp,
- xfs_rtblock_t rtbno)
+ xfs_fileoff_t off)
{
- return rounddown_64(rtbno, mp->m_sb.sb_rextsize);
+ return rounddown_64(off, mp->m_sb.sb_rextsize);
}
/* Convert an rt extent number to a file block offset in the rt bitmap file. */
@@ -130,6 +172,9 @@ xfs_rtx_to_rbmblock(
struct xfs_mount *mp,
xfs_rtxnum_t rtx)
{
+ if (xfs_has_rtgroups(mp))
+ return div_u64(rtx, mp->m_rtx_per_rbmblock);
+
return rtx >> mp->m_blkbit_log;
}
@@ -139,6 +184,13 @@ xfs_rtx_to_rbmword(
struct xfs_mount *mp,
xfs_rtxnum_t rtx)
{
+ if (xfs_has_rtgroups(mp)) {
+ unsigned int mod;
+
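+ /*
+ * With the blkinfo header subtracted, the payload words per
+ * block need not be a power of two, so a real remainder is
+ * needed here instead of the mask used below.
+ */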
+ div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod);
+ return mod;
+ }
+
return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1);
}
@@ -148,6 +200,9 @@ xfs_rbmblock_to_rtx(
struct xfs_mount *mp,
xfs_fileoff_t rbmoff)
{
+ if (xfs_has_rtgroups(mp))
+ return rbmoff * mp->m_rtx_per_rbmblock;
+
return rbmoff << mp->m_blkbit_log;
}
@@ -157,7 +212,14 @@ xfs_rbmblock_wordptr(
struct xfs_rtalloc_args *args,
unsigned int index)
{
- union xfs_rtword_raw *words = args->rbmbp->b_addr;
+ struct xfs_mount *mp = args->mp;
+ union xfs_rtword_raw *words;
+ struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr;
+
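+ /*
+ * rtgroups bitmap blocks carry a metadata header, so the bitmap
+ * payload starts immediately after it (hdr + 1).
+ */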
+ if (xfs_has_rtgroups(mp))
+ words = (union xfs_rtword_raw *)(hdr + 1);
+ else
+ words = args->rbmbp->b_addr;
return words + index;
}
@@ -170,6 +232,8 @@ xfs_rtbitmap_getword(
{
union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
+ if (xfs_has_rtgroups(args->mp))
+ return be32_to_cpu(word->rtg);
return word->old;
}
@@ -182,7 +246,10 @@ xfs_rtbitmap_setword(
{
union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index);
- word->old = value;
+ if (xfs_has_rtgroups(args->mp))
+ word->rtg = cpu_to_be32(value);
+ else
+ word->old = value;
}
/*
@@ -207,6 +274,9 @@ xfs_rtsumoffs_to_block(
struct xfs_mount *mp,
xfs_rtsumoff_t rsumoff)
{
+ if (xfs_has_rtgroups(mp))
+ return rsumoff / mp->m_blockwsize;
+
return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t));
}
@@ -221,6 +291,9 @@ xfs_rtsumoffs_to_infoword(
{
unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG;
+ if (xfs_has_rtgroups(mp))
+ return rsumoff % mp->m_blockwsize;
+
return rsumoff & mask;
}
@@ -230,7 +303,13 @@ xfs_rsumblock_infoptr(
struct xfs_rtalloc_args *args,
unsigned int index)
{
- union xfs_suminfo_raw *info = args->sumbp->b_addr;
+ union xfs_suminfo_raw *info;
+ struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr;
+
+ if (xfs_has_rtgroups(args->mp))
+ info = (union xfs_suminfo_raw *)(hdr + 1);
+ else
+ info = args->sumbp->b_addr;
return info + index;
}
@@ -243,6 +322,8 @@ xfs_suminfo_get(
{
union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+ if (xfs_has_rtgroups(args->mp))
+ return be32_to_cpu(info->rtg);
return info->old;
}
@@ -255,10 +336,28 @@ xfs_suminfo_add(
{
union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index);
+ if (xfs_has_rtgroups(args->mp)) {
+ be32_add_cpu(&info->rtg, delta);
+ return be32_to_cpu(info->rtg);
+ }
+
info->old += delta;
return info->old;
}
+static inline const struct xfs_buf_ops *
+xfs_rtblock_ops(
+ struct xfs_mount *mp,
+ enum xfs_rtg_inodes type)
+{
+ if (xfs_has_rtgroups(mp)) {
+ if (type == XFS_RTGI_SUMMARY)
+ return &xfs_rtsummary_buf_ops;
+ return &xfs_rtbitmap_buf_ops;
+ }
+ return &xfs_rtbuf_ops;
+}
+
/*
* Functions for walking free space rtextents in the realtime bitmap.
*/
@@ -268,7 +367,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -291,53 +390,43 @@ int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
xfs_fileoff_t bbno, int delta);
int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len);
-int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
xfs_rtxnum_t start, xfs_rtxnum_t end,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtalloc_query_range_fn fn,
- void *priv);
-int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_rtxnum_t start, xfs_rtxlen_t len,
- bool *is_free);
-/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxnum_t start, /* starting rtext number to free */
- xfs_rtxlen_t len); /* length of extent freed */
-
+int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn, void *priv);
+int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp,
+ xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free);
+int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t start, xfs_rtxlen_t len);
/* Same as above, but in units of rt blocks. */
-int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
- xfs_filblks_t rtlen);
+int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t rtbno, xfs_filblks_t rtlen);
-xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
- rtextents);
+xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp);
+xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp);
+xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp,
+ xfs_rtbxlen_t rtextents);
xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp,
- unsigned int rsumlevels, xfs_extlen_t rbmblocks);
-
-int xfs_rtfile_initialize_blocks(struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data);
+ unsigned int *rsumlevels);
-void xfs_rtbitmap_lock(struct xfs_mount *mp);
-void xfs_rtbitmap_unlock(struct xfs_mount *mp);
-void xfs_rtbitmap_trans_join(struct xfs_trans *tp);
+int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb, void *data);
+int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
+int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip,
+ struct xfs_trans *tp, bool init);
-/* Lock the rt bitmap inode in shared mode */
-#define XFS_RBMLOCK_BITMAP (1U << 0)
-/* Lock the rt summary inode in shared mode */
-#define XFS_RBMLOCK_SUMMARY (1U << 1)
-
-void xfs_rtbitmap_lock_shared(struct xfs_mount *mp,
- unsigned int rbmlock_flags);
-void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
- unsigned int rbmlock_flags);
#else /* CONFIG_XFS_RT */
# define xfs_rtfree_extent(t,rtg,b,l) (-ENOSYS)
-# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS)
+
+static inline int xfs_rtfree_blocks(struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno,
+ xfs_filblks_t rtlen)
+{
+ return -ENOSYS;
+}
# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS)
# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS)
# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS)
@@ -345,17 +434,11 @@ void xfs_rtbitmap_unlock_shared(struct xfs_mount *mp,
# define xfs_rtbuf_cache_relse(a) (0)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
static inline xfs_filblks_t
-xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
+xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
{
/* shut up gcc */
return 0;
}
-# define xfs_rtsummary_blockcount(mp, l, b) (0)
-# define xfs_rtbitmap_lock(mp) do { } while (0)
-# define xfs_rtbitmap_trans_join(tp) do { } while (0)
-# define xfs_rtbitmap_unlock(mp) do { } while (0)
-# define xfs_rtbitmap_lock_shared(mp, lf) do { } while (0)
-# define xfs_rtbitmap_unlock_shared(mp, lf) do { } while (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTBITMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
new file mode 100644
index 000000000000..e74bb059f24f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -0,0 +1,697 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_buf_item.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+
+/* Find the first usable fsblock in this rtgroup. */
+static inline uint32_t
+xfs_rtgroup_min_block(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
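+ /*
+ * The realtime superblock occupies the first rt extent of group
+ * zero, so allocation there must start past sb_rextsize blocks.
+ */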
+ if (xfs_has_rtsb(mp) && rgno == 0)
+ return mp->m_sb.sb_rextsize;
+
+ return 0;
+}
+
+/* Precompute this group's geometry */
+void
+xfs_rtgroup_calc_geometry(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ rtg->rtg_extents = __xfs_rtgroup_extents(mp, rgno, rgcount, rextents);
+ rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ rtg_group(rtg)->xg_min_gbno = xfs_rtgroup_min_block(mp, rgno);
+}
+
+int
+xfs_rtgroup_alloc(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ rtg = kzalloc(sizeof(struct xfs_rtgroup), GFP_KERNEL);
+ if (!rtg)
+ return -ENOMEM;
+
+ xfs_rtgroup_calc_geometry(mp, rtg, rgno, rgcount, rextents);
+
+ error = xfs_group_insert(mp, rtg_group(rtg), rgno, XG_TYPE_RTG);
+ if (error)
+ goto out_free_rtg;
+ return 0;
+
+out_free_rtg:
+ kfree(rtg);
+ return error;
+}
+
+void
+xfs_rtgroup_free(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ xfs_group_free(mp, rgno, XG_TYPE_RTG, NULL);
+}
+
+/* Free a range of incore rtgroup objects. */
+void
+xfs_free_rtgroups(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno)
+{
+ xfs_rgnumber_t rgno;
+
+ for (rgno = first_rgno; rgno < end_rgno; rgno++)
+ xfs_rtgroup_free(mp, rgno);
+}
+
+/* Initialize some range of incore rtgroup objects. */
+int
+xfs_initialize_rtgroups(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno,
+ xfs_rtbxlen_t rextents)
+{
+ xfs_rgnumber_t index;
+ int error;
+
+ if (first_rgno >= end_rgno)
+ return 0;
+
+ for (index = first_rgno; index < end_rgno; index++) {
+ error = xfs_rtgroup_alloc(mp, index, end_rgno, rextents);
+ if (error)
+ goto out_unwind_new_rtgs;
+ }
+
+ return 0;
+
+out_unwind_new_rtgs:
+ xfs_free_rtgroups(mp, first_rgno, index);
+ return error;
+}
+
+/* Compute the number of rt extents in this realtime group. */
+xfs_rtxnum_t
+__xfs_rtgroup_extents(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents)
+{
+ ASSERT(rgno < rgcount);
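+
+ /*
+ * Every group but the last holds sb_rgextents rt extents; the tail
+ * group holds the remainder. Illustrative example: rextents = 100
+ * with sb_rgextents = 40 gives groups of 40, 40, and 20.
+ */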
+ if (rgno == rgcount - 1)
+ return rextents - ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents);
+
+ ASSERT(xfs_has_rtgroups(mp));
+ return mp->m_sb.sb_rgextents;
+}
+
+xfs_rtxnum_t
+xfs_rtgroup_extents(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return __xfs_rtgroup_extents(mp, rgno, mp->m_sb.sb_rgcount,
+ mp->m_sb.sb_rextents);
+}
+
+/*
+ * Update the rt extent count of the previous tail rtgroup if it changed during
+ * recovery (i.e. recovery of a growfs).
+ */
+int
+xfs_update_last_rtgroup_size(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t prev_rgcount)
+{
+ struct xfs_rtgroup *rtg;
+
+ ASSERT(prev_rgcount > 0);
+
+ rtg = xfs_rtgroup_grab(mp, prev_rgcount - 1);
+ if (!rtg)
+ return -EFSCORRUPTED;
+ rtg->rtg_extents = __xfs_rtgroup_extents(mp, prev_rgcount - 1,
+ mp->m_sb.sb_rgcount, mp->m_sb.sb_rextents);
+ rtg_group(rtg)->xg_block_count = rtg->rtg_extents * mp->m_sb.sb_rextsize;
+ xfs_rtgroup_rele(rtg);
+ return 0;
+}
+
+/* Lock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_lock(
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+ !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ /*
+ * Lock both realtime free space metadata inodes for a freespace
+ * update.
+ */
+ xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL);
+ xfs_ilock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_ilock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED);
+ }
+}
+
+/* Unlock metadata inodes associated with this rt group. */
+void
+xfs_rtgroup_unlock(
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
+ !(rtglock_flags & XFS_RTGLOCK_BITMAP));
+
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_SUMMARY], XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_iunlock(rtg->rtg_inodes[XFS_RTGI_BITMAP], XFS_ILOCK_SHARED);
+ }
+}
+
+/*
+ * Join realtime group metadata inodes to the transaction. The ILOCKs will be
+ * released on transaction commit.
+ */
+void
+xfs_rtgroup_trans_join(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags)
+{
+ ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
+ ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
+
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_BITMAP],
+ XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, rtg->rtg_inodes[XFS_RTGI_SUMMARY],
+ XFS_ILOCK_EXCL);
+ }
+}
+
+/* Retrieve rt group geometry. */
+int
+xfs_rtgroup_get_geometry(
+ struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo)
+{
+ /* Fill out form. */
+ memset(rgeo, 0, sizeof(*rgeo));
+ rgeo->rg_number = rtg_rgno(rtg);
+ rgeo->rg_length = rtg_group(rtg)->xg_block_count;
+ xfs_rtgroup_geom_health(rtg, rgeo);
+ return 0;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static struct lock_class_key xfs_rtginode_lock_class;
+
+static int
+xfs_rtginode_ilock_cmp_fn(
+ const struct lockdep_map *m1,
+ const struct lockdep_map *m2)
+{
+ const struct xfs_inode *ip1 =
+ container_of(m1, struct xfs_inode, i_lock.dep_map);
+ const struct xfs_inode *ip2 =
+ container_of(m2, struct xfs_inode, i_lock.dep_map);
+
+ if (ip1->i_projid < ip2->i_projid)
+ return -1;
+ if (ip1->i_projid > ip2->i_projid)
+ return 1;
+ return 0;
+}
+
+static inline void
+xfs_rtginode_ilock_print_fn(
+ const struct lockdep_map *m)
+{
+ const struct xfs_inode *ip =
+ container_of(m, struct xfs_inode, i_lock.dep_map);
+
+ printk(KERN_CONT " rgno=%u", ip->i_projid);
+}
+
+/*
+ * Most of the time each of the RTG inode locks is only taken one at a time.
+ * But when committing deferred ops, more than one of a kind can be taken.
+ * However, deferred rt ops will be committed in rgno order so there is no
+ * potential for deadlocks. The code here is needed to tell lockdep about this
+ * order.
+ */
+static inline void
+xfs_rtginode_lockdep_setup(
+ struct xfs_inode *ip,
+ xfs_rgnumber_t rgno,
+ enum xfs_rtg_inodes type)
+{
+ lockdep_set_class_and_subclass(&ip->i_lock, &xfs_rtginode_lock_class,
+ type);
+ lock_set_cmp_fn(&ip->i_lock, xfs_rtginode_ilock_cmp_fn,
+ xfs_rtginode_ilock_print_fn);
+}
+#else
+#define xfs_rtginode_lockdep_setup(ip, rgno, type) do { } while (0)
+#endif /* CONFIG_PROVE_LOCKING */
+
+struct xfs_rtginode_ops {
+ const char *name; /* short name */
+
+ enum xfs_metafile_type metafile_type;
+
+ unsigned int sick; /* rtgroup sickness flag */
+
+ /* Does the fs have this feature? */
+ bool (*enabled)(struct xfs_mount *mp);
+
+ /* Create this rtgroup metadata inode and initialize it. */
+ int (*create)(struct xfs_rtgroup *rtg,
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ bool init);
+};
+
+static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
+ [XFS_RTGI_BITMAP] = {
+ .name = "bitmap",
+ .metafile_type = XFS_METAFILE_RTBITMAP,
+ .sick = XFS_SICK_RG_BITMAP,
+ .create = xfs_rtbitmap_create,
+ },
+ [XFS_RTGI_SUMMARY] = {
+ .name = "summary",
+ .metafile_type = XFS_METAFILE_RTSUMMARY,
+ .sick = XFS_SICK_RG_SUMMARY,
+ .create = xfs_rtsummary_create,
+ },
+};
+
+/* Return the shortname of this rtgroup inode. */
+const char *
+xfs_rtginode_name(
+ enum xfs_rtg_inodes type)
+{
+ return xfs_rtginode_ops[type].name;
+}
+
+/* Return the metafile type of this rtgroup inode. */
+enum xfs_metafile_type
+xfs_rtginode_metafile_type(
+ enum xfs_rtg_inodes type)
+{
+ return xfs_rtginode_ops[type].metafile_type;
+}
+
+/* Should this rtgroup inode be present? */
+bool
+xfs_rtginode_enabled(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+
+ if (!ops->enabled)
+ return true;
+ return ops->enabled(rtg_mount(rtg));
+}
+
+/* Mark an rtgroup inode sick */
+void
+xfs_rtginode_mark_sick(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+
+ xfs_group_mark_sick(rtg_group(rtg), ops->sick);
+}
+
+/* Load an existing rtgroup inode into the rtgroup structure. */
+int
+xfs_rtginode_load(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+ int error;
+
+ if (!xfs_rtginode_enabled(rtg, type))
+ return 0;
+
+ if (!xfs_has_rtgroups(mp)) {
+ xfs_ino_t ino;
+
+ switch (type) {
+ case XFS_RTGI_BITMAP:
+ ino = mp->m_sb.sb_rbmino;
+ break;
+ case XFS_RTGI_SUMMARY:
+ ino = mp->m_sb.sb_rsumino;
+ break;
+ default:
+ /* None of the other types exist on !rtgroups */
+ return 0;
+ }
+
+ error = xfs_trans_metafile_iget(tp, ino, ops->metafile_type,
+ &ip);
+ } else {
+ const char *path;
+
+ if (!mp->m_rtdirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ path = xfs_rtginode_path(rtg_rgno(rtg), type);
+ if (!path)
+ return -ENOMEM;
+ error = xfs_metadir_load(tp, mp->m_rtdirip, path,
+ ops->metafile_type, &ip);
+ kfree(path);
+ }
+
+ if (error) {
+ if (xfs_metadata_is_sick(error))
+ xfs_rtginode_mark_sick(rtg, type);
+ return error;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE)) {
+ xfs_irele(ip);
+ xfs_rtginode_mark_sick(rtg, type);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_IS_CORRUPT(mp, ip->i_projid != rtg_rgno(rtg))) {
+ xfs_irele(ip);
+ xfs_rtginode_mark_sick(rtg, type);
+ return -EFSCORRUPTED;
+ }
+
+ xfs_rtginode_lockdep_setup(ip, rtg_rgno(rtg), type);
+ rtg->rtg_inodes[type] = ip;
+ return 0;
+}
+
+/* Release an rtgroup metadata inode. */
+void
+xfs_rtginode_irele(
+ struct xfs_inode **ipp)
+{
+ if (*ipp)
+ xfs_irele(*ipp);
+ *ipp = NULL;
+}
+
+/* Create an rtgroup metadata inode and initialize it. */
+int
+xfs_rtginode_create(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type,
+ bool init)
+{
+ const struct xfs_rtginode_ops *ops = &xfs_rtginode_ops[type];
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_metadir_update upd = {
+ .dp = mp->m_rtdirip,
+ .metafile_type = ops->metafile_type,
+ };
+ int error;
+
+ if (!xfs_rtginode_enabled(rtg, type))
+ return 0;
+
+ if (!mp->m_rtdirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ upd.path = xfs_rtginode_path(rtg_rgno(rtg), type);
+ if (!upd.path)
+ return -ENOMEM;
+
+ error = xfs_metadir_start_create(&upd);
+ if (error)
+ goto out_path;
+
+ error = xfs_metadir_create(&upd, S_IFREG);
+ if (error)
+ return error;
+
+ xfs_rtginode_lockdep_setup(upd.ip, rtg_rgno(rtg), type);
+
+ upd.ip->i_projid = rtg_rgno(rtg);
+ error = ops->create(rtg, upd.ip, upd.tp, init);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_metadir_commit(&upd);
+ if (error)
+ goto out_path;
+
+ kfree(upd.path);
+ xfs_finish_inode_setup(upd.ip);
+ rtg->rtg_inodes[type] = upd.ip;
+ return 0;
+
+out_cancel:
+ xfs_metadir_cancel(&upd, error);
+ /* Have to finish setting up the inode to ensure it's deleted. */
+ if (upd.ip) {
+ xfs_finish_inode_setup(upd.ip);
+ xfs_irele(upd.ip);
+ }
+out_path:
+ kfree(upd.path);
+ return error;
+}
+
+/* Create the parent directory for all rtgroup inodes and load it. */
+int
+xfs_rtginode_mkdir_parent(
+ struct xfs_mount *mp)
+{
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_mkdir(mp->m_metadirip, "rtgroups", &mp->m_rtdirip);
+}
+
+/* Load the parent directory of all rtgroup inodes. */
+int
+xfs_rtginode_load_parent(
+ struct xfs_trans *tp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!mp->m_metadirip) {
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+ }
+
+ return xfs_metadir_load(tp, mp->m_metadirip, "rtgroups",
+ XFS_METAFILE_DIR, &mp->m_rtdirip);
+}
+
+/* Check superblock fields for a read or a write. */
+static xfs_failaddr_t
+xfs_rtsb_verify_common(
+ struct xfs_buf *bp)
+{
+ struct xfs_rtsb *rsb = bp->b_addr;
+
+ if (!xfs_verify_magic(bp, rsb->rsb_magicnum))
+ return __this_address;
+ if (rsb->rsb_pad)
+ return __this_address;
+
+ /* Everything to the end of the fs block must be zero */
+ if (memchr_inv(rsb + 1, 0, BBTOB(bp->b_length) - sizeof(*rsb)))
+ return __this_address;
+
+ return NULL;
+}
+
+/* Check superblock fields for a read or revalidation. */
+static inline xfs_failaddr_t
+xfs_rtsb_verify_all(
+ struct xfs_buf *bp)
+{
+ struct xfs_rtsb *rsb = bp->b_addr;
+ struct xfs_mount *mp = bp->b_mount;
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtsb_verify_common(bp);
+ if (fa)
+ return fa;
+
+ if (memcmp(&rsb->rsb_fname, &mp->m_sb.sb_fname, XFSLABEL_MAX))
+ return __this_address;
+ if (!uuid_equal(&rsb->rsb_uuid, &mp->m_sb.sb_uuid))
+ return __this_address;
+ if (!uuid_equal(&rsb->rsb_meta_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return NULL;
+}
+
+static void
+xfs_rtsb_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_RTSB_CRC_OFF)) {
+ xfs_verifier_error(bp, -EFSBADCRC, __this_address);
+ return;
+ }
+
+ fa = xfs_rtsb_verify_all(bp);
+ if (fa)
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+static void
+xfs_rtsb_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_failaddr_t fa;
+
+ fa = xfs_rtsb_verify_common(bp);
+ if (fa) {
+ xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+ return;
+ }
+
+ xfs_buf_update_cksum(bp, XFS_RTSB_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_rtsb_buf_ops = {
+ .name = "xfs_rtsb",
+ .magic = { 0, cpu_to_be32(XFS_RTSB_MAGIC) },
+ .verify_read = xfs_rtsb_read_verify,
+ .verify_write = xfs_rtsb_write_verify,
+ .verify_struct = xfs_rtsb_verify_all,
+};
+
+/* Update a realtime superblock from the primary fs super */
+void
+xfs_update_rtsb(
+ struct xfs_buf *rtsb_bp,
+ const struct xfs_buf *sb_bp)
+{
+ const struct xfs_dsb *dsb = sb_bp->b_addr;
+ struct xfs_rtsb *rsb = rtsb_bp->b_addr;
+ const uuid_t *meta_uuid;
+
+ rsb->rsb_magicnum = cpu_to_be32(XFS_RTSB_MAGIC);
+
+ rsb->rsb_pad = 0;
+ memcpy(&rsb->rsb_fname, &dsb->sb_fname, XFSLABEL_MAX);
+
+ memcpy(&rsb->rsb_uuid, &dsb->sb_uuid, sizeof(rsb->rsb_uuid));
+
+ /*
+ * The metadata uuid is the fs uuid if the metauuid feature is not
+ * enabled.
+ */
+ if (dsb->sb_features_incompat &
+ cpu_to_be32(XFS_SB_FEAT_INCOMPAT_META_UUID))
+ meta_uuid = &dsb->sb_meta_uuid;
+ else
+ meta_uuid = &dsb->sb_uuid;
+ memcpy(&rsb->rsb_meta_uuid, meta_uuid, sizeof(rsb->rsb_meta_uuid));
+}
+
+/*
+ * Update the realtime superblock from a filesystem superblock and log it to
+ * the given transaction.
+ */
+struct xfs_buf *
+xfs_log_rtsb(
+ struct xfs_trans *tp,
+ const struct xfs_buf *sb_bp)
+{
+ struct xfs_buf *rtsb_bp;
+
+ if (!xfs_has_rtsb(tp->t_mountp))
+ return NULL;
+
+ rtsb_bp = xfs_trans_getrtsb(tp);
+ if (!rtsb_bp) {
+ /*
+ * It's possible for the rtgroups feature to be enabled
+ * without an incore rt superblock buffer if the rt geometry
+ * was specified at mkfs time but the rt section has not yet
+ * been attached. In this case, rblocks must be zero.
+ */
+ ASSERT(tp->t_mountp->m_sb.sb_rblocks == 0);
+ return NULL;
+ }
+
+ xfs_update_rtsb(rtsb_bp, sb_bp);
+ xfs_trans_ordered_buf(tp, rtsb_bp);
+ return rtsb_bp;
+}
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
new file mode 100644
index 000000000000..7e7e491ff06f
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __LIBXFS_RTGROUP_H
+#define __LIBXFS_RTGROUP_H 1
+
+#include "xfs_group.h"
+
+struct xfs_mount;
+struct xfs_trans;
+
+enum xfs_rtg_inodes {
+ XFS_RTGI_BITMAP, /* allocation bitmap */
+ XFS_RTGI_SUMMARY, /* allocation summary */
+
+ XFS_RTGI_MAX,
+};
+
+#ifdef MAX_LOCKDEP_SUBCLASSES
+static_assert(XFS_RTGI_MAX <= MAX_LOCKDEP_SUBCLASSES);
+#endif
+
+/*
+ * Realtime group incore structure, similar to the per-AG structure.
+ */
+struct xfs_rtgroup {
+ struct xfs_group rtg_group;
+
+ /* per-rtgroup metadata inodes */
+ struct xfs_inode *rtg_inodes[XFS_RTGI_MAX];
+
+ /* Number of rt extents in this group */
+ xfs_rtxnum_t rtg_extents;
+
+ /*
+ * Cache of rt summary level per bitmap block with the invariant that
+ * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
+ * or 0 if rsum[i][bbno] == 0 for all i.
+ *
+ * Reads and writes are serialized by the rsumip inode lock.
+ */
+ uint8_t *rtg_rsum_cache;
+};
+
+static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
+{
+ return container_of(xg, struct xfs_rtgroup, rtg_group);
+}
+
+static inline struct xfs_group *rtg_group(struct xfs_rtgroup *rtg)
+{
+ return &rtg->rtg_group;
+}
+
+static inline struct xfs_mount *rtg_mount(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_mount;
+}
+
+static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_gno;
+}
+
+/* Passive rtgroup references */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_get(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return to_rtg(xfs_group_get(mp, rgno, XG_TYPE_RTG));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_hold(
+ struct xfs_rtgroup *rtg)
+{
+ return to_rtg(xfs_group_hold(rtg_group(rtg)));
+}
+
+static inline void
+xfs_rtgroup_put(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_group_put(rtg_group(rtg));
+}
+
+/* Active rtgroup references */
+static inline struct xfs_rtgroup *
+xfs_rtgroup_grab(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno)
+{
+ return to_rtg(xfs_group_grab(mp, rgno, XG_TYPE_RTG));
+}
+
+static inline void
+xfs_rtgroup_rele(
+ struct xfs_rtgroup *rtg)
+{
+ xfs_group_rele(rtg_group(rtg));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next_range(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t start_rgno,
+ xfs_rgnumber_t end_rgno)
+{
+ return to_rtg(xfs_group_next_range(mp, rtg ? rtg_group(rtg) : NULL,
+ start_rgno, end_rgno, XG_TYPE_RTG));
+}
+
+static inline struct xfs_rtgroup *
+xfs_rtgroup_next(
+ struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg)
+{
+ return xfs_rtgroup_next_range(mp, rtg, 0, mp->m_sb.sb_rgcount - 1);
+}
+
+static inline xfs_rtblock_t
+xfs_rgbno_to_rtb(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t rgbno)
+{
+ return xfs_gbno_to_fsb(rtg_group(rtg), rgbno);
+}
+
+static inline xfs_rgnumber_t
+xfs_rtb_to_rgno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return xfs_fsb_to_gno(mp, rtbno, XG_TYPE_RTG);
+}
+
+static inline xfs_rgblock_t
+xfs_rtb_to_rgbno(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return xfs_fsb_to_gbno(mp, rtbno, XG_TYPE_RTG);
+}
+
+/* Is rtbno the start of a RT group? */
+static inline bool
+xfs_rtbno_is_group_start(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0;
+}
+
+/* Convert an rtgroups rt extent number into an rgbno. */
+static inline xfs_rgblock_t
+xfs_rtx_to_rgbno(
+ struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t rtx)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (likely(mp->m_rtxblklog >= 0))
+ return rtx << mp->m_rtxblklog;
+ return rtx * mp->m_sb.sb_rextsize;
+}
+
+static inline xfs_daddr_t
+xfs_rtb_to_daddr(
+ struct xfs_mount *mp,
+ xfs_rtblock_t rtbno)
+{
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+ uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
+
+ return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
+}
+
+static inline xfs_rtblock_t
+xfs_daddr_to_rtb(
+ struct xfs_mount *mp,
+ xfs_daddr_t daddr)
+{
+ xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
+
+ if (xfs_has_rtgroups(mp)) {
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rgnumber_t rgno;
+ uint32_t rgbno;
+
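+ /*
+ * Illustrative example: with 48000 blocks per group and
+ * g->blklog = 16, device fsblock 100000 splits into rgno 2
+ * and rgbno 4000, giving rtbno (2 << 16) + 4000.
+ */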
+ rgno = div_u64_rem(bno, g->blocks, &rgbno);
+ return ((xfs_rtblock_t)rgno << g->blklog) + rgbno;
+ }
+
+ return bno;
+}
+
+#ifdef CONFIG_XFS_RT
+int xfs_rtgroup_alloc(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents);
+void xfs_rtgroup_free(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+
+void xfs_free_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno);
+int xfs_initialize_rtgroups(struct xfs_mount *mp, xfs_rgnumber_t first_rgno,
+ xfs_rgnumber_t end_rgno, xfs_rtbxlen_t rextents);
+
+xfs_rtxnum_t __xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno,
+ xfs_rgnumber_t rgcount, xfs_rtbxlen_t rextents);
+xfs_rtxnum_t xfs_rtgroup_extents(struct xfs_mount *mp, xfs_rgnumber_t rgno);
+void xfs_rtgroup_calc_geometry(struct xfs_mount *mp, struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgno, xfs_rgnumber_t rgcount,
+ xfs_rtbxlen_t rextents);
+
+int xfs_update_last_rtgroup_size(struct xfs_mount *mp,
+ xfs_rgnumber_t prev_rgcount);
+
+/* Lock the rt bitmap inode in exclusive mode */
+#define XFS_RTGLOCK_BITMAP (1U << 0)
+/* Lock the rt bitmap inode in shared mode */
+#define XFS_RTGLOCK_BITMAP_SHARED (1U << 1)
+
+#define XFS_RTGLOCK_ALL_FLAGS (XFS_RTGLOCK_BITMAP | \
+ XFS_RTGLOCK_BITMAP_SHARED)
+
+void xfs_rtgroup_lock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+void xfs_rtgroup_unlock(struct xfs_rtgroup *rtg, unsigned int rtglock_flags);
+void xfs_rtgroup_trans_join(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ unsigned int rtglock_flags);
+
+int xfs_rtgroup_get_geometry(struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo);
+
+int xfs_rtginode_mkdir_parent(struct xfs_mount *mp);
+int xfs_rtginode_load_parent(struct xfs_trans *tp);
+
+const char *xfs_rtginode_name(enum xfs_rtg_inodes type);
+enum xfs_metafile_type xfs_rtginode_metafile_type(enum xfs_rtg_inodes type);
+bool xfs_rtginode_enabled(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type);
+void xfs_rtginode_mark_sick(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type);
+int xfs_rtginode_load(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type,
+ struct xfs_trans *tp);
+int xfs_rtginode_create(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type,
+ bool init);
+void xfs_rtginode_irele(struct xfs_inode **ipp);
+
+static inline const char *xfs_rtginode_path(xfs_rgnumber_t rgno,
+ enum xfs_rtg_inodes type)
+{
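+ /* e.g. "0.bitmap" or "17.summary" within the "rtgroups" metadir */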
+ return kasprintf(GFP_KERNEL, "%u.%s", rgno, xfs_rtginode_name(type));
+}
+
+void xfs_update_rtsb(struct xfs_buf *rtsb_bp,
+ const struct xfs_buf *sb_bp);
+struct xfs_buf *xfs_log_rtsb(struct xfs_trans *tp,
+ const struct xfs_buf *sb_bp);
+#else
+static inline void xfs_free_rtgroups(struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno)
+{
+}
+
+static inline int xfs_initialize_rtgroups(struct xfs_mount *mp,
+ xfs_rgnumber_t first_rgno, xfs_rgnumber_t end_rgno,
+ xfs_rtbxlen_t rextents)
+{
+ return 0;
+}
+
+# define xfs_rtgroup_extents(mp, rgno) (0)
+# define xfs_update_last_rtgroup_size(mp, rgno) (-EOPNOTSUPP)
+# define xfs_rtgroup_lock(rtg, gf) ((void)0)
+# define xfs_rtgroup_unlock(rtg, gf) ((void)0)
+# define xfs_rtgroup_trans_join(tp, rtg, gf) ((void)0)
+# define xfs_update_rtsb(bp, sb_bp) ((void)0)
+# define xfs_log_rtsb(tp, sb_bp) (NULL)
+# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP)
+#endif /* CONFIG_XFS_RT */
+
+#endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d95409f3cba6..e81b240b7158 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -27,6 +27,7 @@
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
#include "xfs_exchrange.h"
+#include "xfs_rtgroup.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -180,6 +181,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_EXCHANGE_RANGE;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_PARENT)
features |= XFS_FEAT_PARENT;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
+ features |= XFS_FEAT_METADIR;
return features;
}
@@ -232,11 +235,37 @@ xfs_validate_sb_read(
return 0;
}
+/* Return the number of extents covered by a single rt bitmap file */
+static xfs_rtbxlen_t
+xfs_extents_per_rbm(
+ struct xfs_sb *sbp)
+{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ return sbp->sb_rgextents;
+ return sbp->sb_rextents;
+}
+
+/*
+ * Return the payload size of a single rt bitmap block (without the metadata
+ * header if any).
+ */
+static inline unsigned int
+xfs_rtbmblock_size(
+ struct xfs_sb *sbp)
+{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ return sbp->sb_blocksize - sizeof(struct xfs_rtbuf_blkinfo);
+ return sbp->sb_blocksize;
+}
+
static uint64_t
-xfs_sb_calc_rbmblocks(
+xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
- return howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize);
+ return howmany_64(xfs_extents_per_rbm(sbp),
+ NBBY * xfs_rtbmblock_size(sbp));
}
/* Validate the realtime geometry */
@@ -258,7 +287,7 @@ xfs_validate_rt_geometry(
if (sbp->sb_rextents == 0 ||
sbp->sb_rextents != div_u64(sbp->sb_rblocks, sbp->sb_rextsize) ||
sbp->sb_rextslog != xfs_compute_rextslog(sbp->sb_rextents) ||
- sbp->sb_rbmblocks != xfs_sb_calc_rbmblocks(sbp))
+ sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp))
return false;
return true;
@@ -339,6 +368,78 @@ xfs_validate_sb_write(
return 0;
}
+int
+xfs_compute_rgblklog(
+ xfs_rtxlen_t rgextents,
+ xfs_rgblock_t rextsize)
+{
+ uint64_t rgblocks = (uint64_t)rgextents * rextsize;
+
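+ /*
+ * Round up to a power-of-two slot so every group starts at a
+ * fixed stride in the rt block address space; e.g. 48000 blocks
+ * per group yields rgblklog 16 (a 65536-block slot).
+ */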
+ return xfs_highbit64(rgblocks - 1) + 1;
+}
+
+static int
+xfs_validate_sb_rtgroups(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ uint64_t groups;
+ int rgblklog;
+
+ if (sbp->sb_rextsize == 0) {
+ xfs_warn(mp,
+"Realtime extent size must not be zero.");
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgextents > XFS_MAX_RGBLOCKS / sbp->sb_rextsize) {
+ xfs_warn(mp,
+"Realtime group size (%u) must be less than %u rt extents.",
+ sbp->sb_rgextents,
+ XFS_MAX_RGBLOCKS / sbp->sb_rextsize);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgextents < XFS_MIN_RGEXTENTS) {
+ xfs_warn(mp,
+"Realtime group size (%u) must be at least %u rt extents.",
+ sbp->sb_rgextents, XFS_MIN_RGEXTENTS);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rgcount > XFS_MAX_RGNUMBER) {
+ xfs_warn(mp,
+"Realtime groups (%u) must be less than %u.",
+ sbp->sb_rgcount, XFS_MAX_RGNUMBER);
+ return -EINVAL;
+ }
+
+ groups = howmany_64(sbp->sb_rextents, sbp->sb_rgextents);
+ if (groups != sbp->sb_rgcount) {
+ xfs_warn(mp,
+"Realtime groups (%u) do not cover the entire rt section; need (%llu) groups.",
+ sbp->sb_rgcount, groups);
+ return -EINVAL;
+ }
+
+ /* Exchange-range is required for fsr to work on realtime files */
+ if (!(sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_EXCHRANGE)) {
+ xfs_warn(mp,
+"Realtime groups feature requires exchange-range support.");
+ return -EINVAL;
+ }
+
+ rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents, sbp->sb_rextsize);
+ if (sbp->sb_rgblklog != rgblklog) {
+ xfs_warn(mp,
+"Realtime group log (%d) does not match expected value (%d).",
+ sbp->sb_rgblklog, rgblklog);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -350,6 +451,7 @@ xfs_validate_sb_common(
uint32_t agcount = 0;
uint32_t rem;
bool has_dalign;
+ int error;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp,
@@ -398,6 +500,32 @@ xfs_validate_sb_common(
sbp->sb_inoalignmt, align);
return -EINVAL;
}
+
+ if (!sbp->sb_spino_align ||
+ sbp->sb_spino_align > sbp->sb_inoalignmt ||
+ (sbp->sb_inoalignmt % sbp->sb_spino_align) != 0) {
+ xfs_warn(mp,
+ "Sparse inode alignment (%u) is invalid.",
+ sbp->sb_spino_align);
+ return -EINVAL;
+ }
+ } else if (sbp->sb_spino_align) {
+ xfs_warn(mp,
+ "Sparse inode alignment (%u) should be zero.",
+ sbp->sb_spino_align);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ if (memchr_inv(sbp->sb_pad, 0, sizeof(sbp->sb_pad))) {
+ xfs_warn(mp,
+"Metadir superblock padding fields must be zero.");
+ return -EINVAL;
+ }
+
+ error = xfs_validate_sb_rtgroups(mp, sbp);
+ if (error)
+ return error;
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
@@ -566,6 +694,14 @@ xfs_validate_sb_common(
void
xfs_sb_quota_from_disk(struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ sbp->sb_uquotino = NULLFSINO;
+ sbp->sb_gquotino = NULLFSINO;
+ sbp->sb_pquotino = NULLFSINO;
+ return;
+ }
+
/*
* older mkfs doesn't initialize quota inodes to NULLFSINO. This
* leads to in-core values having two different values for a quota
@@ -689,6 +825,20 @@ __xfs_sb_from_disk(
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ to->sb_metadirino = be64_to_cpu(from->sb_metadirino);
+ to->sb_rgblklog = from->sb_rgblklog;
+ memcpy(to->sb_pad, from->sb_pad, sizeof(to->sb_pad));
+ to->sb_rgcount = be32_to_cpu(from->sb_rgcount);
+ to->sb_rgextents = be32_to_cpu(from->sb_rgextents);
+ to->sb_rbmino = NULLFSINO;
+ to->sb_rsumino = NULLFSINO;
+ } else {
+ to->sb_metadirino = NULLFSINO;
+ to->sb_rgcount = 1;
+ to->sb_rgextents = 0;
+ }
}
void
@@ -706,6 +856,15 @@ xfs_sb_quota_to_disk(
{
uint16_t qflags = from->sb_qflags;
+ if (xfs_sb_is_v5(from) &&
+ (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ to->sb_qflags = cpu_to_be16(from->sb_qflags);
+ to->sb_uquotino = cpu_to_be64(0);
+ to->sb_gquotino = cpu_to_be64(0);
+ to->sb_pquotino = cpu_to_be64(0);
+ return;
+ }
+
to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
/*
@@ -836,6 +995,16 @@ xfs_sb_to_disk(
to->sb_lsn = cpu_to_be64(from->sb_lsn);
if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)
uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) {
+ to->sb_metadirino = cpu_to_be64(from->sb_metadirino);
+ to->sb_rgblklog = from->sb_rgblklog;
+ memset(to->sb_pad, 0, sizeof(to->sb_pad));
+ to->sb_rgcount = cpu_to_be32(from->sb_rgcount);
+ to->sb_rgextents = cpu_to_be32(from->sb_rgextents);
+ to->sb_rbmino = cpu_to_be64(0);
+ to->sb_rsumino = cpu_to_be64(0);
+ }
}
/*
@@ -965,13 +1134,43 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.verify_write = xfs_sb_write_verify,
};
+/* Compute cached rt geometry from the incore sb. */
void
-xfs_mount_sb_set_rextsize(
+xfs_sb_mount_rextsize(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
+ struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG];
+
mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize);
mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize);
+
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)) {
+ rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
+ rgs->blklog = mp->m_sb.sb_rgblklog;
+ rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
+ } else {
+ rgs->blocks = 0;
+ rgs->blklog = 0;
+ rgs->blkmask = (uint64_t)-1;
+ }
+}
+
+/* Update incore sb rt extent size, then recompute the cached rt geometry. */
+void
+xfs_mount_sb_set_rextsize(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp,
+ xfs_agblock_t rextsize)
+{
+ sbp->sb_rextsize = rextsize;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR))
+ sbp->sb_rgblklog = xfs_compute_rgblklog(sbp->sb_rgextents,
+ rextsize);
+
+ xfs_sb_mount_rextsize(mp, sbp);
}
/*
@@ -988,6 +1187,8 @@ xfs_sb_mount_common(
struct xfs_mount *mp,
struct xfs_sb *sbp)
{
+ struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG];
+
mp->m_agfrotor = 0;
atomic_set(&mp->m_agirotor, 0);
mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -996,9 +1197,14 @@ xfs_sb_mount_common(
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
mp->m_blockmask = sbp->sb_blocksize - 1;
- mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
- mp->m_blockwmask = mp->m_blockwsize - 1;
- xfs_mount_sb_set_rextsize(mp, sbp);
+ mp->m_blockwsize = xfs_rtbmblock_size(sbp) >> XFS_WORDLOG;
+ mp->m_rtx_per_rbmblock = mp->m_blockwsize << XFS_NBWORDLOG;
+
+ ags->blocks = mp->m_sb.sb_agblocks;
+ ags->blklog = mp->m_sb.sb_agblklog;
+ ags->blkmask = xfs_mask32lo(mp->m_sb.sb_agblklog);
+
+ xfs_sb_mount_rextsize(mp, sbp);
mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, true);
mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, false);
@@ -1045,11 +1251,6 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
- *
- * Do not update sb_frextents here because it is not part of the lazy
- * sb counters, despite having a percpu counter. It is always kept
- * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
- * and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
@@ -1060,6 +1261,16 @@ xfs_log_sb(
percpu_counter_sum_positive(&mp->m_fdblocks);
}
+ /*
+ * sb_frextents was added to the lazy sb counters when the rt groups
+ * feature was introduced. This counter can go negative due to the way
+ * we handle nearly-lockless reservations, so we must use the _positive
+ * variant here to avoid writing out nonsense frextents.
+ */
+ if (xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents =
+ percpu_counter_sum_positive(&mp->m_frextents);
+
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
@@ -1109,18 +1320,17 @@ int
xfs_update_secondary_sbs(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno = 1;
+ struct xfs_perag *pag = NULL;
int saved_error = 0;
int error = 0;
LIST_HEAD (buffer_list);
/* update secondary superblocks. */
- for_each_perag_from(mp, agno, pag) {
+ while ((pag = xfs_perag_next_from(mp, pag, 1))) {
struct xfs_buf *bp;
error = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR),
+ XFS_AG_DADDR(mp, pag_agno(pag), XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
@@ -1132,7 +1342,7 @@ xfs_update_secondary_sbs(
if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
- pag->pag_agno);
+ pag_agno(pag));
if (!saved_error)
saved_error = error;
continue;
@@ -1146,26 +1356,22 @@ xfs_update_secondary_sbs(
xfs_buf_relse(bp);
/* don't hold too many buffers at once */
- if (agno % 16)
+ if (pag_agno(pag) % 16)
continue;
error = xfs_buf_delwri_submit(&buffer_list);
if (error) {
xfs_warn(mp,
"write error %d updating a secondary superblock near ag %d",
- error, pag->pag_agno);
+ error, pag_agno(pag));
if (!saved_error)
saved_error = error;
continue;
}
}
error = xfs_buf_delwri_submit(&buffer_list);
- if (error) {
- xfs_warn(mp,
- "write error %d updating a secondary superblock near ag %d",
- error, agno);
- }
-
+ if (error)
+ xfs_warn(mp, "error %d writing secondary superblocks", error);
return saved_error ? saved_error : error;
}
@@ -1175,10 +1381,12 @@ xfs_update_secondary_sbs(
*/
int
xfs_sync_sb_buf(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool update_rtsb)
{
struct xfs_trans *tp;
struct xfs_buf *bp;
+ struct xfs_buf *rtsb_bp = NULL;
int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
@@ -1188,6 +1396,11 @@ xfs_sync_sb_buf(
bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
+ if (update_rtsb) {
+ rtsb_bp = xfs_log_rtsb(tp, bp);
+ if (rtsb_bp)
+ xfs_trans_bhold(tp, rtsb_bp);
+ }
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -1196,7 +1409,11 @@ xfs_sync_sb_buf(
* write out the sb buffer to get the changes to disk
*/
error = xfs_bwrite(bp);
+ if (!error && rtsb_bp)
+ error = xfs_bwrite(rtsb_bp);
out:
+ if (rtsb_bp)
+ xfs_buf_relse(rtsb_bp);
xfs_buf_relse(bp);
return error;
}
@@ -1283,6 +1500,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
if (xfs_has_exchange_range(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
+ if (xfs_has_metadir(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1298,6 +1517,11 @@ xfs_fs_geometry(
return;
geo->version = XFS_FSOP_GEOM_VERSION_V5;
+
+ if (xfs_has_rtgroups(mp)) {
+ geo->rgcount = sbp->sb_rgcount;
+ geo->rgextents = sbp->sb_rgextents;
+ }
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 885c83755991..34d0dd374e9b 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -15,10 +15,11 @@ struct xfs_perag;
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
-extern int xfs_sync_sb_buf(struct xfs_mount *mp);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp, bool update_rtsb);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+void xfs_sb_mount_rextsize(struct xfs_mount *mp, struct xfs_sb *sbp);
void xfs_mount_sb_set_rextsize(struct xfs_mount *mp,
- struct xfs_sb *sbp);
+ struct xfs_sb *sbp, xfs_agblock_t rextsize);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
@@ -43,5 +44,6 @@ bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
bool xfs_validate_rt_geometry(struct xfs_sb *sbp);
uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+int xfs_compute_rgblklog(xfs_rtxlen_t rgextents, xfs_rgblock_t rextsize);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 33b84a3a83ff..e7efdb9ceaf3 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -38,7 +38,10 @@ extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbitmap_buf_ops;
+extern const struct xfs_buf_ops xfs_rtsummary_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
+extern const struct xfs_buf_ops xfs_rtsb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
@@ -157,6 +160,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_SB_RBLOCKS 0x00000800
#define XFS_TRANS_SB_REXTENTS 0x00001000
#define XFS_TRANS_SB_REXTSLOG 0x00002000
+#define XFS_TRANS_SB_RGCOUNT 0x00004000
/*
* Here we centralize the specification of XFS meta-data buffer reference count
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 3c40f37e82c7..c962ad64b0c1 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode_set_mtime_to_ts(inode, tv);
- if (flags & XFS_ICHGTIME_CHG)
- inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_ACCESS)
inode_set_atime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 1a7f95bcf069..bab402340b5d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -224,7 +224,7 @@ xfs_rtalloc_block_count(
xfs_rtxlen_t rtxlen;
rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN);
- rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen);
+ rtbmp_blocks = xfs_rtbitmap_blockcount_len(mp, rtxlen);
return (rtbmp_blocks + 1) * num_ops;
}
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index c299b16c9365..1faf04204c5d 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -12,6 +12,8 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
/*
@@ -111,7 +113,7 @@ xfs_verify_ino(
/* Is this an internal inode number? */
inline bool
-xfs_internal_inum(
+xfs_is_sb_inum(
struct xfs_mount *mp,
xfs_ino_t ino)
{
@@ -129,24 +131,42 @@ xfs_verify_dir_ino(
struct xfs_mount *mp,
xfs_ino_t ino)
{
- if (xfs_internal_inum(mp, ino))
+ if (xfs_is_sb_inum(mp, ino))
return false;
return xfs_verify_ino(mp, ino);
}
/*
- * Verify that an realtime block number pointer doesn't point off the
- * end of the realtime device.
+ * Verify that a realtime block number pointer neither points outside the
+ * allocatable areas of the rtgroup nor off the end of the realtime
+ * device.
*/
inline bool
xfs_verify_rtbno(
struct xfs_mount *mp,
xfs_rtblock_t rtbno)
{
+ if (xfs_has_rtgroups(mp)) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+ xfs_rtxnum_t rtx = xfs_rtb_to_rtx(mp, rtbno);
+
+ if (rgno >= mp->m_sb.sb_rgcount)
+ return false;
+ if (rtx >= xfs_rtgroup_extents(mp, rgno))
+ return false;
+ if (xfs_has_rtsb(mp) && rgno == 0 && rtx == 0)
+ return false;
+ return true;
+ }
+
return rtbno < mp->m_sb.sb_rblocks;
}
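The rtgroup checks above rely on decomposing an rtblock number into a group number and an in-group block. A sketch of that decode, assuming the (rgno << sb_rgblklog | rgbno) encoding implied by the xfs_rtb_to_rgno() helper:

/* Hypothetical decode; the shift width comes from sb_rgblklog. */
static inline void example_rtb_decode(uint64_t rtbno, unsigned int rgblklog,
				      uint32_t *rgno, uint64_t *rgbno)
{
	*rgno = rtbno >> rgblklog;		   /* realtime group */
	*rgbno = rtbno & ((1ULL << rgblklog) - 1); /* block in group */
}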
-/* Verify that a realtime device extent is fully contained inside the volume. */
+/*
+ * Verify that an allocated realtime device extent neither points outside
+ * allocatable areas of the rtgroup, across an rtgroup boundary, nor off the
+ * end of the realtime device.
+ */
bool
xfs_verify_rtbext(
struct xfs_mount *mp,
@@ -159,7 +179,14 @@ xfs_verify_rtbext(
if (!xfs_verify_rtbno(mp, rtbno))
return false;
- return xfs_verify_rtbno(mp, rtbno + len - 1);
+ if (!xfs_verify_rtbno(mp, rtbno + len - 1))
+ return false;
+
+ if (xfs_has_rtgroups(mp) &&
+ xfs_rtb_to_rgno(mp, rtbno) != xfs_rtb_to_rgno(mp, rtbno + len - 1))
+ return false;
+
+ return true;
}
/* Calculate the range of valid icount values. */
@@ -170,13 +197,12 @@ xfs_icount_range(
unsigned long long *max)
{
unsigned long long nr_inos = 0;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
/* root, rtbitmap, rtsum all live in the first chunk */
*min = XFS_INODES_PER_CHUNK;
- for_each_perag(mp, agno, pag)
+ while ((pag = xfs_perag_next(mp, pag)))
nr_inos += pag->agino_max - pag->agino_min + 1;
*max = nr_inos;
}
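This hunk shows the iterator conversion applied throughout the series: for_each_perag() becomes a while loop over xfs_perag_next(), starting from NULL, with the iterator assumed to release the previous group's reference on each step. A sketch of the idiom; early exits release the group explicitly, as the xrep_bmap_find_mappings() hunk later in this diff does with xfs_perag_rele():

/* Iterator idiom used by this series (sketch). */
static void example_walk_perags(struct xfs_mount *mp)
{
	struct xfs_perag *pag = NULL;

	while ((pag = xfs_perag_next(mp, pag))) {
		/* use pag; the next call releases this reference */
	}
}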
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index a8cd44d03ef6..bf33c2b1e43e 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -9,10 +9,12 @@
typedef uint32_t prid_t; /* project ID */
typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
+typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
+typedef uint32_t xfs_rgnumber_t; /* realtime group number */
typedef uint64_t xfs_extnum_t; /* # of extents in a file */
typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
@@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t;
#define NULLFILEOFF ((xfs_fileoff_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
+#define NULLRGBLOCK ((xfs_rgblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
+#define NULLRGNUMBER ((xfs_rgnumber_t)-1)
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
@@ -212,6 +216,16 @@ enum xbtree_recpacking {
XBTREE_RECPACKING_FULL,
};
+enum xfs_group_type {
+ XG_TYPE_AG,
+ XG_TYPE_RTG,
+ XG_TYPE_MAX,
+} __packed;
+
+#define XG_TYPE_STRINGS \
+ { XG_TYPE_AG, "ag" }, \
+ { XG_TYPE_RTG, "rtg" }
+
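XG_TYPE_STRINGS pairs each enum value with a printable name in the shape expected by the kernel's tracing __print_symbolic() helper. As a plain-C illustration (not from this patch), the same macro can seed a lookup table:

/* Illustrative: resolve a group type to its name using the table. */
static const char *example_xg_type_name(enum xfs_group_type type)
{
	static const struct { int value; const char *name; } strs[] = {
		XG_TYPE_STRINGS
	};
	unsigned int i;

	for (i = 0; i < sizeof(strs) / sizeof(strs[0]); i++)
		if (strs[i].value == type)
			return strs[i].name;
	return "?";
}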
/*
* Type verifier functions
*/
@@ -222,7 +236,7 @@ bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno,
xfs_fsblock_t len);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
-bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
+bool xfs_is_sb_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno,
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index f8e5b67128d2..61f80a6410c7 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -144,11 +144,16 @@ xchk_superblock(
if (sb->sb_rootino != cpu_to_be64(mp->m_sb.sb_rootino))
xchk_block_set_preen(sc, bp);
- if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
- xchk_block_set_preen(sc, bp);
+ if (xfs_has_metadir(sc->mp)) {
+ if (sb->sb_metadirino != cpu_to_be64(mp->m_sb.sb_metadirino))
+ xchk_block_set_preen(sc, bp);
+ } else {
+ if (sb->sb_rbmino != cpu_to_be64(mp->m_sb.sb_rbmino))
+ xchk_block_set_preen(sc, bp);
- if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
- xchk_block_set_preen(sc, bp);
+ if (sb->sb_rsumino != cpu_to_be64(mp->m_sb.sb_rsumino))
+ xchk_block_set_preen(sc, bp);
+ }
if (sb->sb_rextsize != cpu_to_be32(mp->m_sb.sb_rextsize))
xchk_block_set_corrupt(sc, bp);
@@ -224,11 +229,13 @@ xchk_superblock(
* sb_icount, sb_ifree, sb_fdblocks, sb_frexents
*/
- if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
- xchk_block_set_preen(sc, bp);
+ if (!xfs_has_metadir(mp)) {
+ if (sb->sb_uquotino != cpu_to_be64(mp->m_sb.sb_uquotino))
+ xchk_block_set_preen(sc, bp);
- if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
- xchk_block_set_preen(sc, bp);
+ if (sb->sb_gquotino != cpu_to_be64(mp->m_sb.sb_gquotino))
+ xchk_block_set_preen(sc, bp);
+ }
/*
* Skip the quota flags since repair will force quotacheck.
@@ -274,8 +281,15 @@ xchk_superblock(
if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok)))
xchk_block_set_corrupt(sc, bp);
- if (sb->sb_features2 != sb->sb_bad_features2)
- xchk_block_set_preen(sc, bp);
+ if (xfs_has_metadir(mp)) {
+ if (sb->sb_rgblklog != mp->m_sb.sb_rgblklog)
+ xchk_block_set_corrupt(sc, bp);
+ if (memchr_inv(sb->sb_pad, 0, sizeof(sb->sb_pad)))
+ xchk_block_set_preen(sc, bp);
+ } else {
+ if (sb->sb_features2 != sb->sb_bad_features2)
+ xchk_block_set_preen(sc, bp);
+ }
}
/* Check sb_features2 flags that are set at mkfs time. */
@@ -337,8 +351,10 @@ xchk_superblock(
if (sb->sb_spino_align != cpu_to_be32(mp->m_sb.sb_spino_align))
xchk_block_set_corrupt(sc, bp);
- if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
- xchk_block_set_preen(sc, bp);
+ if (!xfs_has_metadir(mp)) {
+ if (sb->sb_pquotino != cpu_to_be64(mp->m_sb.sb_pquotino))
+ xchk_block_set_preen(sc, bp);
+ }
/* Don't care about sb_lsn */
}
@@ -349,6 +365,14 @@ xchk_superblock(
xchk_block_set_corrupt(sc, bp);
}
+ if (xfs_has_metadir(mp)) {
+ if (sb->sb_rgcount != cpu_to_be32(mp->m_sb.sb_rgcount))
+ xchk_block_set_corrupt(sc, bp);
+
+ if (sb->sb_rgextents != cpu_to_be32(mp->m_sb.sb_rgextents))
+ xchk_block_set_corrupt(sc, bp);
+ }
+
/* Everything else must be zero. */
if (memchr_inv(sb + 1, 0,
BBTOB(bp->b_length) - sizeof(struct xfs_dsb)))
@@ -552,7 +576,7 @@ xchk_agf(
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
- if (eoag != pag->block_count)
+ if (eoag != pag_group(pag)->xg_block_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
/* Check the AGF btree roots and levels */
@@ -932,7 +956,7 @@ xchk_agi(
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
- if (eoag != pag->block_count)
+ if (eoag != pag_group(pag)->xg_block_count)
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check btree roots and levels */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 2f98d90d7fd6..0fad0baaba2f 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -208,8 +208,8 @@ xrep_agf_init_header(
memset(agf, 0, BBTOB(agf_bp->b_length));
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(pag->pag_agno);
- agf->agf_length = cpu_to_be32(pag->block_count);
+ agf->agf_seqno = cpu_to_be32(pag_agno(pag));
+ agf->agf_length = cpu_to_be32(pag_group(pag)->xg_block_count);
agf->agf_flfirst = old_agf->agf_flfirst;
agf->agf_fllast = old_agf->agf_fllast;
agf->agf_flcount = old_agf->agf_flcount;
@@ -384,7 +384,7 @@ xrep_agf(
* was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
if (error)
@@ -687,7 +687,7 @@ xrep_agfl_init_header(
agfl = XFS_BUF_TO_AGFL(agfl_bp);
memset(agfl, 0xFF, BBTOB(agfl_bp->b_length));
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno);
+ agfl->agfl_seqno = cpu_to_be32(pag_agno(sc->sa.pag));
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
/*
@@ -741,7 +741,7 @@ xrep_agfl(
* was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL);
if (error)
@@ -897,8 +897,8 @@ xrep_agi_init_header(
memset(agi, 0, BBTOB(agi_bp->b_length));
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(pag->pag_agno);
- agi->agi_length = cpu_to_be32(pag->block_count);
+ agi->agi_seqno = cpu_to_be32(pag_agno(pag));
+ agi->agi_length = cpu_to_be32(pag_group(pag)->xg_block_count);
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_has_crc(mp))
@@ -1038,12 +1038,10 @@ xrep_iunlink_reload_next(
{
struct xfs_scrub *sc = ragi->sc;
struct xfs_inode *ip;
- xfs_ino_t ino;
xfs_agino_t ret = NULLAGINO;
int error;
- ino = XFS_AGINO_TO_INO(sc->mp, sc->sa.pag->pag_agno, agino);
- error = xchk_iget(ragi->sc, ino, &ip);
+ error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino), &ip);
if (error)
return ret;
@@ -1114,9 +1112,9 @@ xrep_iunlink_igrab(
struct xfs_perag *pag,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
return false;
if (!xfs_inode_on_unlinked_list(ip))
@@ -1140,7 +1138,7 @@ xrep_iunlink_visit(
unsigned int bucket;
int error;
- ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == ragi->sc->sa.pag->pag_agno);
+ ASSERT(XFS_INO_TO_AGNO(mp, ip->i_ino) == pag_agno(ragi->sc->sa.pag));
ASSERT(xfs_inode_on_unlinked_list(ip));
agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
@@ -1171,7 +1169,7 @@ xrep_iunlink_mark_incore(
struct xrep_agi *ragi)
{
struct xfs_perag *pag = ragi->sc->sa.pag;
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
uint32_t first_index = 0;
bool done = false;
unsigned int nr_found = 0;
@@ -1211,7 +1209,7 @@ xrep_iunlink_mark_incore(
* us to see this inode, so another lookup from the
* same index will not find it again.
*/
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
@@ -1278,9 +1276,7 @@ xrep_iunlink_mark_ondisk_rec(
* on because we haven't actually scrubbed the inobt or the
* inodes yet.
*/
- error = xchk_iget(ragi->sc,
- XFS_AGINO_TO_INO(mp, sc->sa.pag->pag_agno,
- agino),
+ error = xchk_iget(ragi->sc, xfs_agino_to_ino(sc->sa.pag, agino),
&ip);
if (error)
continue;
@@ -1539,15 +1535,13 @@ xrep_iunlink_relink_next(
ip = xfs_iunlink_lookup(pag, agino);
if (!ip) {
- xfs_ino_t ino;
xfs_agino_t prev_agino;
/*
* No inode exists in cache. Load it off the disk so that we
* can reinsert it into the incore unlinked list.
*/
- ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
- error = xchk_iget(sc, ino, &ip);
+ error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
if (error)
return -EFSCORRUPTED;
@@ -1601,15 +1595,13 @@ xrep_iunlink_relink_prev(
ip = xfs_iunlink_lookup(pag, agino);
if (!ip) {
- xfs_ino_t ino;
xfs_agino_t next_agino;
/*
* No inode exists in cache. Load it off the disk so that we
* can reinsert it into the incore unlinked list.
*/
- ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
- error = xchk_iget(sc, ino, &ip);
+ error = xchk_iget(sc, xfs_agino_to_ino(pag, agino), &ip);
if (error)
return -EFSCORRUPTED;
@@ -1769,7 +1761,7 @@ xrep_agi(
* was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED.
*/
error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, sc->sa.pag->pag_agno,
+ XFS_AG_DADDR(mp, pag_agno(sc->sa.pag),
XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &ragi->agi_bp, NULL);
if (error)
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index d1b8a4997dd2..8b282138097f 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -139,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
- if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_alloc_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 30295898cc8a..0433363a90b6 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -132,17 +132,16 @@ int
xrep_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ struct xfs_group *xg = pag_group(sc->sa.pag);
unsigned int busy_gen;
/*
* Make sure the busy extent list is clear because we can't put extents
* on there twice.
*/
- busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
- if (xfs_extent_busy_list_empty(sc->sa.pag))
+ if (xfs_extent_busy_list_empty(xg, &busy_gen))
return 0;
-
- return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
+ return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0);
}
/* Check for any obvious conflicts in the free extent. */
@@ -210,7 +209,7 @@ xrep_abt_stash(
if (error)
return error;
- trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+ trace_xrep_abt_found(sc->sa.pag, &arec);
error = xfarray_append(ra->free_records, &arec);
if (error)
@@ -484,8 +483,8 @@ xrep_abt_reserve_space(
ASSERT(arec.ar_blockcount <= UINT_MAX);
len = min_t(unsigned int, arec.ar_blockcount, desired);
- trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
- arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+ trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock,
+ len, XFS_RMAP_OWN_AG);
error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
arec.ar_startblock, len);
@@ -543,7 +542,7 @@ xrep_abt_dispose_one(
/* Add a deferred rmap for each extent we used. */
if (resv->used > 0)
- xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
+ xfs_rmap_alloc_extent(sc->tp, pag_agno(pag), resv->agbno,
resv->used, XFS_RMAP_OWN_AG);
/*
@@ -554,8 +553,8 @@ xrep_abt_dispose_one(
if (free_aglen == 0)
return 0;
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
- free_aglen, ra->new_bnobt.oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ ra->new_bnobt.oinfo.oi_owner);
error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
@@ -849,6 +848,7 @@ xrep_allocbt(
{
struct xrep_abt *ra;
struct xfs_mount *mp = sc->mp;
+ unsigned int busy_gen;
char *descr;
int error;
@@ -869,7 +869,7 @@ xrep_allocbt(
* on there twice. In theory we cleared this before we started, but
* let's not risk the filesystem.
*/
- if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) {
error = -EDEADLOCK;
goto out_ra;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 5ab2ac53c920..7e00312225ed 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -19,6 +19,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_rtgroup.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -314,8 +315,20 @@ xchk_bmap_rt_iextent_xref(
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
+ int error;
+
+ error = xchk_rtgroup_init_existing(info->sc,
+ xfs_rtb_to_rgno(ip->i_mount, irec->br_startblock),
+ &info->sc->sr);
+ if (!xchk_fblock_process_error(info->sc, info->whichfork,
+ irec->br_startoff, &error))
+ return;
+
+ xchk_rtgroup_lock(&info->sc->sr, XCHK_RTGLOCK_ALL);
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
+
+ xchk_rtgroup_free(info->sc, &info->sc->sr);
}
/* Cross-reference a single datadev extent record. */
@@ -600,8 +613,8 @@ xchk_bmap_check_rmap(
if (irec.br_startoff != check_rec.rm_offset)
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
- if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_ag.pag->pag_agno,
+ if (irec.br_startblock !=
+ xfs_agbno_to_fsb(to_perag(cur->bc_group),
check_rec.rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
check_rec.rm_offset);
@@ -761,11 +774,10 @@ xchk_bmap_check_rmaps(
struct xfs_scrub *sc,
int whichfork)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error;
- for_each_perag(sc->mp, agno, pag) {
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
if (error ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
@@ -822,9 +834,12 @@ xchk_bmap_iext_mapping(
/* Are these two mappings contiguous with each other? */
static inline bool
xchk_are_bmaps_contiguous(
+ const struct xchk_bmap_info *info,
const struct xfs_bmbt_irec *b1,
const struct xfs_bmbt_irec *b2)
{
+ struct xfs_mount *mp = info->sc->mp;
+
/* Don't try to combine unallocated mappings. */
if (!xfs_bmap_is_real_extent(b1))
return false;
@@ -838,6 +853,17 @@ xchk_are_bmaps_contiguous(
return false;
if (b1->br_state != b2->br_state)
return false;
+
+ /*
+ * Don't combine bmaps that would cross rtgroup boundaries. This is a
+ * valid state, but if combined they will fail rtb extent checks.
+ */
+ if (info->is_rt && xfs_has_rtgroups(mp)) {
+ if (xfs_rtb_to_rgno(mp, b1->br_startblock) !=
+ xfs_rtb_to_rgno(mp, b2->br_startblock))
+ return false;
+ }
+
return true;
}
@@ -875,7 +901,7 @@ xchk_bmap_iext_iter(
* that we just read, if possible.
*/
while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
- if (!xchk_are_bmaps_contiguous(irec, &got))
+ if (!xchk_are_bmaps_contiguous(info, irec, &got))
break;
if (!xchk_bmap_iext_mapping(info, &got)) {
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 4505f4829d53..7c4955482641 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -196,7 +196,7 @@ xrep_bmap_check_fork_rmap(
return -EFSCORRUPTED;
/* Check that this is within the AG. */
- if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+ if (!xfs_verify_agbext(to_perag(cur->bc_group), rec->rm_startblock,
rec->rm_blockcount))
return -EFSCORRUPTED;
@@ -237,7 +237,6 @@ xrep_bmap_walk_rmap(
void *priv)
{
struct xrep_bmap *rb = priv;
- struct xfs_mount *mp = cur->bc_mp;
xfs_fsblock_t fsbno;
int error = 0;
@@ -269,8 +268,7 @@ xrep_bmap_walk_rmap(
if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten)
return -EFSCORRUPTED;
- fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
- rec->rm_startblock);
+ fsbno = xfs_agbno_to_fsb(to_perag(cur->bc_group), rec->rm_startblock);
if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
rb->old_bmbt_block_count += rec->rm_blockcount;
@@ -409,12 +407,11 @@ xrep_bmap_find_mappings(
struct xrep_bmap *rb)
{
struct xfs_scrub *sc = rb->sc;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error = 0;
/* Iterate the rmaps for extents. */
- for_each_perag(sc->mp, agno, pag) {
+ while ((pag = xfs_perag_next(sc->mp, pag))) {
error = xrep_bmap_scan_ag(rb, pag);
if (error) {
xfs_perag_rele(pag);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 22f5f1a9d3f0..5cbd94b56582 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -34,11 +34,13 @@
#include "xfs_quota.h"
#include "xfs_exchmaps.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
+#include "scrub/tempfile.h"
/* Common code for the metadata scrubbers. */
@@ -121,6 +123,17 @@ xchk_process_error(
}
bool
+xchk_process_rt_error(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ xfs_rgblock_t rgbno,
+ int *error)
+{
+ return __xchk_process_error(sc, rgno, rgbno, error,
+ XFS_SCRUB_OFLAG_CORRUPT, __return_address);
+}
+
+bool
xchk_xref_process_error(
struct xfs_scrub *sc,
xfs_agnumber_t agno,
@@ -513,7 +526,7 @@ xchk_perag_drain_and_lock(
* Obviously, this should be slanted against scrub and in favor
* of runtime threads.
*/
- if (!xfs_perag_intent_busy(sa->pag))
+ if (!xfs_group_intent_busy(pag_group(sa->pag)))
return 0;
if (sa->agf_bp) {
@@ -528,7 +541,7 @@ xchk_perag_drain_and_lock(
if (!(sc->flags & XCHK_FSGATES_DRAIN))
return -ECHRNG;
- error = xfs_perag_intent_drain(sa->pag);
+ error = xfs_group_intent_drain(pag_group(sa->pag));
if (error == -ERESTARTSYS)
error = -EINTR;
} while (!error);
@@ -683,6 +696,72 @@ xchk_ag_init(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/*
+ * For scrubbing a realtime group, grab all the in-core resources we'll need to
+ * check the metadata, which means taking the ILOCK of the realtime group's
+ * metadata inodes. Callers must not join these inodes to the transaction with
+ * non-zero lockflags or concurrency problems will result. The @rtglock_flags
+ * argument takes XFS_RTGLOCK_* flags.
+ */
+int
+xchk_rtgroup_init(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg == NULL);
+ ASSERT(sr->rtlock_flags == 0);
+
+ sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
+ if (!sr->rtg)
+ return -ENOENT;
+ return 0;
+}
+
+void
+xchk_rtgroup_lock(
+ struct xchk_rt *sr,
+ unsigned int rtglock_flags)
+{
+ xfs_rtgroup_lock(sr->rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+}
+
+/*
+ * Unlock the realtime group. This must be done /after/ committing (or
+ * cancelling) the scrub transaction.
+ */
+static void
+xchk_rtgroup_unlock(
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg != NULL);
+
+ if (sr->rtlock_flags) {
+ xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
+ sr->rtlock_flags = 0;
+ }
+}
+
+/*
+ * Unlock the realtime group and release its resources. This must be done
+ * /after/ committing (or cancelling) the scrub transaction.
+ */
+void
+xchk_rtgroup_free(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr)
+{
+ ASSERT(sr->rtg != NULL);
+
+ xchk_rtgroup_unlock(sr);
+
+ xfs_rtgroup_put(sr->rtg);
+ sr->rtg = NULL;
+}
+#endif /* CONFIG_XFS_RT */
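A sketch of the expected call pattern for these helpers, based on the xchk_bmap_rt_iextent_xref() hunk earlier in this diff: acquire the group, lock it, check the metadata, then unlock and release:

/* Sketch of the rtgroup scrub helper call pattern. */
static int example_scrub_rtgroup(struct xfs_scrub *sc, xfs_rgnumber_t rgno)
{
	int error;

	error = xchk_rtgroup_init(sc, rgno, &sc->sr);
	if (error)
		return error;

	xchk_rtgroup_lock(&sc->sr, XCHK_RTGLOCK_ALL);
	/* ... examine rtgroup metadata here ... */
	xchk_rtgroup_free(sc, &sc->sr);
	return 0;
}

Note that when the rtgroup lock is held across the scrub transaction, the unlock/free must happen only after the transaction is committed or cancelled, per the comments above.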
+
/* Per-scrubber setup functions */
void
@@ -947,9 +1026,15 @@ xchk_iget_for_scrubbing(
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
return xchk_install_live_inode(sc, ip_in);
- /* Reject internal metadata files and obviously bad inode numbers. */
- if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ /*
+ * On pre-metadir filesystems, reject internal metadata files. For
+ * metadir filesystems, limited scrubbing of any file in the metadata
+ * directory tree by handle is allowed, because that is the only way to
+ * validate the lack of parent pointers in the sb-root metadata inodes.
+ */
+ if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
return -ENOENT;
+ /* Reject obviously bad inode numbers. */
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
@@ -1084,6 +1169,10 @@ xchk_setup_inode_contents(
if (error)
return error;
+ error = xrep_tempfile_adjust_directory_tree(sc);
+ if (error)
+ return error;
+
/* Lock the inode so the VFS cannot touch this file. */
xchk_ilock(sc, XFS_IOLOCK_EXCL);
@@ -1239,12 +1328,6 @@ xchk_metadata_inode_forks(
return 0;
}
- /* They also should never have extended attributes. */
- if (xfs_inode_hasattr(sc->ip)) {
- xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- return 0;
- }
-
/* Invoke the data fork scrubber. */
error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
@@ -1261,6 +1344,21 @@ xchk_metadata_inode_forks(
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
+ /*
+ * Metadata files can only have extended attributes on metadir
+ * filesystems, either for parent pointers or for actual xattr data.
+ */
+ if (xfs_inode_hasattr(sc->ip)) {
+ if (!xfs_has_metadir(sc->mp)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+ }
+
return 0;
}
@@ -1336,7 +1434,7 @@ xchk_inode_is_allocated(
}
/* reject inode numbers outside existing AGs */
- ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ ino = xfs_agino_to_ino(pag, agino);
if (!xfs_verify_ino(mp, ino))
return -EINVAL;
@@ -1446,3 +1544,32 @@ out_rcu:
rcu_read_unlock();
return error;
}
+
+/* Is this inode a root directory for either tree? */
+bool
+xchk_inode_is_dirtree_root(const struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ return ip == mp->m_rootip ||
+ (xfs_has_metadir(mp) && ip == mp->m_metadirip);
+}
+
+/* Does the superblock point down to this inode? */
+bool
+xchk_inode_is_sb_rooted(const struct xfs_inode *ip)
+{
+ return xchk_inode_is_dirtree_root(ip) ||
+ xfs_is_sb_inum(ip->i_mount, ip->i_ino);
+}
+
+/* What is the root directory inumber for this inode? */
+xfs_ino_t
+xchk_inode_rootdir_inum(const struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_metadir_inode(ip))
+ return mp->m_metadirip->i_ino;
+ return mp->m_rootip->i_ino;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 47148cc4a833..9ff3cafd8679 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -12,6 +12,8 @@ void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
+bool xchk_process_rt_error(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+ xfs_rgblock_t rgbno, int *error);
bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
xfs_fileoff_t offset, int *error);
@@ -73,12 +75,15 @@ int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
int xchk_setup_dirtree(struct xfs_scrub *sc);
+int xchk_setup_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_setup_rtbitmap(struct xfs_scrub *sc);
int xchk_setup_rtsummary(struct xfs_scrub *sc);
+int xchk_setup_rgsuperblock(struct xfs_scrub *sc);
#else
# define xchk_setup_rtbitmap xchk_setup_nothing
# define xchk_setup_rtsummary xchk_setup_nothing
+# define xchk_setup_rgsuperblock xchk_setup_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_ino_dqattach(struct xfs_scrub *sc);
@@ -117,6 +122,34 @@ xchk_ag_init_existing(
return error == -ENOENT ? -EFSCORRUPTED : error;
}
+#ifdef CONFIG_XFS_RT
+
+/* All the locks we need to check an rtgroup. */
+#define XCHK_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP)
+
+int xchk_rtgroup_init(struct xfs_scrub *sc, xfs_rgnumber_t rgno,
+ struct xchk_rt *sr);
+
+static inline int
+xchk_rtgroup_init_existing(
+ struct xfs_scrub *sc,
+ xfs_rgnumber_t rgno,
+ struct xchk_rt *sr)
+{
+ int error = xchk_rtgroup_init(sc, rgno, sr);
+
+ return error == -ENOENT ? -EFSCORRUPTED : error;
+}
+
+void xchk_rtgroup_lock(struct xchk_rt *sr, unsigned int rtglock_flags);
+void xchk_rtgroup_free(struct xfs_scrub *sc, struct xchk_rt *sr);
+#else
+# define xchk_rtgroup_init(sc, rgno, sr) (-EFSCORRUPTED)
+# define xchk_rtgroup_init_existing(sc, rgno, sr) (-EFSCORRUPTED)
+# define xchk_rtgroup_lock(sc, lockflags) do { } while (0)
+# define xchk_rtgroup_free(sc, sr) do { } while (0)
+#endif /* CONFIG_XFS_RT */
+
int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno,
struct xchk_ag *sa);
void xchk_ag_btcur_free(struct xchk_ag *sa);
@@ -216,7 +249,8 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
#define xchk_xfile_ag_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
(sc)->mp->m_super->s_id, \
- (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
+ (sc)->sa.pag ? \
+ pag_agno((sc)->sa.pag) : (sc)->sm->sm_agno, \
##__VA_ARGS__)
#define xchk_xfile_ino_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \
@@ -241,4 +275,8 @@ void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
bool *inuse);
+bool xchk_inode_is_dirtree_root(const struct xfs_inode *ip);
+bool xchk_inode_is_sb_rooted(const struct xfs_inode *ip);
+xfs_ino_t xchk_inode_rootdir_inum(const struct xfs_inode *ip);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 4de3f0f40f48..5b6194cef3e5 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -137,7 +137,6 @@ xrep_cow_mark_shared_staging(
{
struct xrep_cow *xc = priv;
struct xfs_refcount_irec rrec;
- xfs_fsblock_t fsbno;
if (!xfs_refcount_check_domain(rec) ||
rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
@@ -145,9 +144,10 @@ xrep_cow_mark_shared_staging(
xrep_cow_trim_refcount(xc, &rrec, rec);
- fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
- rrec.rc_startblock);
- return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
+ return xrep_cow_mark_file_range(xc,
+ xfs_agbno_to_fsb(to_perag(cur->bc_group),
+ rrec.rc_startblock),
+ rrec.rc_blockcount);
}
/*
@@ -177,9 +177,9 @@ xrep_cow_mark_missing_staging(
if (xc->next_bno >= rrec.rc_startblock)
goto next;
+
error = xrep_cow_mark_file_range(xc,
- XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
- xc->next_bno),
+ xfs_agbno_to_fsb(to_perag(cur->bc_group), xc->next_bno),
rrec.rc_startblock - xc->next_bno);
if (error)
return error;
@@ -200,7 +200,6 @@ xrep_cow_mark_missing_staging_rmap(
void *priv)
{
struct xrep_cow *xc = priv;
- xfs_fsblock_t fsbno;
xfs_agblock_t rec_bno;
xfs_extlen_t rec_len;
unsigned int adj;
@@ -222,8 +221,9 @@ xrep_cow_mark_missing_staging_rmap(
rec_len -= adj;
}
- fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
- return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+ return xrep_cow_mark_file_range(xc,
+ xfs_agbno_to_fsb(to_perag(cur->bc_group), rec_bno),
+ rec_len);
}
/*
@@ -275,8 +275,7 @@ xrep_cow_find_bad(
if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
error = xrep_cow_mark_file_range(xc,
- XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
- xc->next_bno),
+ xfs_agbno_to_fsb(pag, xc->next_bno),
xc->irec_startbno + xc->irec.br_blockcount -
xc->next_bno);
if (error)
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index bf9199e8df63..c877bde71e62 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -100,6 +100,14 @@ xchk_dir_check_ftype(
if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ /*
+ * Metadata and regular inodes cannot cross trees. This property
+ * cannot change without a full inode free and realloc cycle, so it's
+ * safe to check this without holding locks.
+ */
+ if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
}
/*
@@ -253,7 +261,7 @@ xchk_dir_actor(
* If this is ".." in the root inode, check that the inum
* matches this dir.
*/
- if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino)
+ if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
}
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 64679fe08446..249313882108 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -415,6 +415,12 @@ xrep_dir_salvage_entry(
if (error)
return 0;
+ /* Don't mix metadata and regular directory trees. */
+ if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) {
+ xchk_irele(sc, ip);
+ return 0;
+ }
+
xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
xchk_irele(sc, ip);
@@ -1270,7 +1276,7 @@ xrep_dir_scan_dirtree(
int error;
/* Roots of directory trees are their own parents. */
- if (sc->ip == sc->mp->m_rootip)
+ if (xchk_inode_is_dirtree_root(sc->ip))
xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
/*
@@ -1632,6 +1638,7 @@ xrep_dir_swap(
struct xrep_dir *rd)
{
struct xfs_scrub *sc = rd->sc;
+ xfs_ino_t ino;
bool ip_local, temp_local;
int error = 0;
@@ -1649,14 +1656,17 @@ xrep_dir_swap(
/*
* Reset the temporary directory's '..' entry to point to the parent
- * that we found. The temporary directory was created with the root
- * directory as the parent, so we can skip this if repairing a
- * subdirectory of the root.
+ * that we found. The dirent replace code asserts if the dirent
+ * already points at the new inumber, so we look it up here.
*
* It's also possible that this replacement could also expand a sf
* tempdir into block format.
*/
- if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
+ error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
+ if (error)
+ return error;
+
+ if (rd->pscan.parent_ino != ino) {
error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
rd->pscan.parent_ino, rd->tx.req.resblks);
if (error)
diff --git a/fs/xfs/scrub/dirtree.c b/fs/xfs/scrub/dirtree.c
index bde58fb561ea..3a9cdf8738b6 100644
--- a/fs/xfs/scrub/dirtree.c
+++ b/fs/xfs/scrub/dirtree.c
@@ -362,7 +362,8 @@ xchk_dirpath_set_outcome(
STATIC int
xchk_dirpath_step_up(
struct xchk_dirtree *dl,
- struct xchk_dirpath *path)
+ struct xchk_dirpath *path,
+ bool is_metadir)
{
struct xfs_scrub *sc = dl->sc;
struct xfs_inode *dp;
@@ -435,6 +436,14 @@ xchk_dirpath_step_up(
goto out_scanlock;
}
+ /* Parent must be in the same directory tree. */
+ if (is_metadir != xfs_is_metadir_inode(dp)) {
+ trace_xchk_dirpath_crosses_tree(dl->sc, dp, path->path_nr,
+ path->nr_steps, &dl->xname, &dl->pptr_rec);
+ error = -EFSCORRUPTED;
+ goto out_scanlock;
+ }
+
/*
* If the extended attributes look as though they have been zapped by
* the inode record repair code, we cannot scan for parent pointers.
@@ -508,6 +517,7 @@ xchk_dirpath_walk_upwards(
struct xchk_dirpath *path)
{
struct xfs_scrub *sc = dl->sc;
+ bool is_metadir;
int error;
ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
@@ -538,6 +548,7 @@ xchk_dirpath_walk_upwards(
* ILOCK state is no longer tracked in the scrub context. Hence we
* must drop @sc->ip's ILOCK during the walk.
*/
+ is_metadir = xfs_is_metadir_inode(sc->ip);
mutex_unlock(&dl->lock);
xchk_iunlock(sc, XFS_ILOCK_EXCL);
@@ -547,7 +558,7 @@ xchk_dirpath_walk_upwards(
* If we see any kind of error here (including corruptions), the parent
* pointer of @sc->ip is corrupt. Stop the whole scan.
*/
- error = xchk_dirpath_step_up(dl, path);
+ error = xchk_dirpath_step_up(dl, path, is_metadir);
if (error) {
xchk_ilock(sc, XFS_ILOCK_EXCL);
mutex_lock(&dl->lock);
@@ -560,7 +571,7 @@ xchk_dirpath_walk_upwards(
* *somewhere* in the path, but we don't need to stop scanning.
*/
while (!error && path->outcome == XCHK_DIRPATH_SCANNING)
- error = xchk_dirpath_step_up(dl, path);
+ error = xchk_dirpath_step_up(dl, path, is_metadir);
/* Retake the locks we had, mark paths, etc. */
xchk_ilock(sc, XFS_ILOCK_EXCL);
@@ -917,7 +928,7 @@ xchk_dirtree(
* scan, because the hook doesn't detach until after sc->ip gets
* released during teardown.
*/
- dl->root_ino = sc->mp->m_rootip->i_ino;
+ dl->root_ino = xchk_inode_rootdir_inum(sc->ip);
dl->scan_ino = sc->ip->i_ino;
trace_xchk_dirtree_start(sc->ip, sc->sm, 0);
@@ -983,3 +994,16 @@ out:
trace_xchk_dirtree_done(sc->ip, sc->sm, error);
return error;
}
+
+/* Does the directory targeted by this scrub have no parents? */
+bool
+xchk_dirtree_parentless(const struct xchk_dirtree *dl)
+{
+ struct xfs_scrub *sc = dl->sc;
+
+ if (xchk_inode_is_dirtree_root(sc->ip))
+ return true;
+ if (VFS_I(sc->ip)->i_nlink == 0)
+ return true;
+ return false;
+}
diff --git a/fs/xfs/scrub/dirtree.h b/fs/xfs/scrub/dirtree.h
index 1e1686365c61..9e5d95492717 100644
--- a/fs/xfs/scrub/dirtree.h
+++ b/fs/xfs/scrub/dirtree.h
@@ -156,17 +156,7 @@ struct xchk_dirtree {
#define xchk_dirtree_for_each_path(dl, path) \
list_for_each_entry((path), &(dl)->path_list, list)
-static inline bool
-xchk_dirtree_parentless(const struct xchk_dirtree *dl)
-{
- struct xfs_scrub *sc = dl->sc;
-
- if (sc->ip == sc->mp->m_rootip)
- return true;
- if (VFS_I(sc->ip)->i_nlink == 0)
- return true;
- return false;
-}
+bool xchk_dirtree_parentless(const struct xchk_dirtree *dl);
int xchk_dirtree_find_paths_to_root(struct xchk_dirtree *dl);
int xchk_dirpath_append(struct xchk_dirtree *dl, struct xfs_inode *ip,
diff --git a/fs/xfs/scrub/findparent.c b/fs/xfs/scrub/findparent.c
index 01766041ba2c..84487072b6dd 100644
--- a/fs/xfs/scrub/findparent.c
+++ b/fs/xfs/scrub/findparent.c
@@ -172,6 +172,10 @@ xrep_findparent_walk_directory(
*/
lock_mode = xfs_ilock_data_map_shared(dp);
+ /* Don't mix metadata and regular directory trees. */
+ if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip))
+ goto out_unlock;
+
/*
* If this directory is known to be sick, we cannot scan it reliably
* and must abort.
@@ -362,15 +366,24 @@ xrep_findparent_confirm(
};
int error;
- /*
- * The root directory always points to itself. Unlinked dirs can point
- * anywhere, so we point them at the root dir too.
- */
- if (sc->ip == sc->mp->m_rootip || VFS_I(sc->ip)->i_nlink == 0) {
+ /* The root directory always points to itself. */
+ if (sc->ip == sc->mp->m_rootip) {
*parent_ino = sc->mp->m_sb.sb_rootino;
return 0;
}
+ /* The metadata root directory always points to itself. */
+ if (sc->ip == sc->mp->m_metadirip) {
+ *parent_ino = sc->mp->m_sb.sb_metadirino;
+ return 0;
+ }
+
+ /* Unlinked dirs can point anywhere; point them up to the root dir. */
+ if (VFS_I(sc->ip)->i_nlink == 0) {
+ *parent_ino = xchk_inode_rootdir_inum(sc->ip);
+ return 0;
+ }
+
/* Reject garbage parent inode numbers and self-referential parents. */
if (*parent_ino == NULLFSINO)
return 0;
@@ -412,8 +425,11 @@ xrep_findparent_self_reference(
if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
return sc->mp->m_sb.sb_rootino;
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_metadirino)
+ return sc->mp->m_sb.sb_metadirino;
+
if (VFS_I(sc->ip)->i_nlink == 0)
- return sc->mp->m_sb.sb_rootino;
+ return xchk_inode_rootdir_inum(sc->ip);
return NULLFSINO;
}
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 1d3e98346933..4a50f8e00040 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -19,6 +19,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -74,10 +75,9 @@ xchk_fscount_warmup(
struct xfs_buf *agi_bp = NULL;
struct xfs_buf *agf_bp = NULL;
struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno;
int error = 0;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
if (xchk_should_terminate(sc, &error))
break;
if (xfs_perag_initialised_agi(pag) &&
@@ -295,9 +295,8 @@ xchk_fscount_aggregate_agcounts(
struct xchk_fscounters *fsc)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
uint64_t delayed;
- xfs_agnumber_t agno;
int tries = 8;
int error = 0;
@@ -306,7 +305,7 @@ retry:
fsc->ifree = 0;
fsc->fdblocks = 0;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
if (xchk_should_terminate(sc, &error))
break;
@@ -327,7 +326,7 @@ retry:
if (xfs_has_lazysbcount(sc->mp)) {
fsc->fdblocks += pag->pagf_btreeblks;
} else {
- error = xchk_fscount_btreeblks(sc, fsc, agno);
+ error = xchk_fscount_btreeblks(sc, fsc, pag_agno(pag));
if (error)
break;
}
@@ -388,7 +387,7 @@ retry:
#ifdef CONFIG_XFS_RT
STATIC int
xchk_fscount_add_frextent(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -409,6 +408,7 @@ xchk_fscount_count_frextents(
struct xchk_fscounters *fsc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = NULL;
int error;
fsc->frextents = 0;
@@ -416,19 +416,20 @@ xchk_fscount_count_frextents(
if (!xfs_has_realtime(mp))
return 0;
- xfs_rtbitmap_lock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
- error = xfs_rtalloc_query_all(sc->mp, sc->tp,
- xchk_fscount_add_frextent, fsc);
- if (error) {
- xchk_set_incomplete(sc);
- goto out_unlock;
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_all(rtg, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error) {
+ xchk_set_incomplete(sc);
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
}
fsc->frextents_delayed = percpu_counter_sum(&mp->m_delalloc_rtextents);
-
-out_unlock:
- xfs_rtbitmap_unlock_shared(sc->mp, XFS_RBMLOCK_BITMAP);
- return error;
+ return 0;
}
#else
STATIC int
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index 469bf645dbea..cda13447a373 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -68,15 +68,16 @@ xrep_fscounters(
/*
* Online repair is only supported on v5 file systems, which require
- * lazy sb counters and thus no update of sb_fdblocks here. But as of
- * now we don't support lazy counting sb_frextents yet, and thus need
- * to also update it directly here. And for that we need to keep
+ * lazy sb counters and thus no update of sb_fdblocks here. But
+ * sb_frextents only uses a lazy counter with rtgroups, and thus needs
+ * to be updated directly here otherwise. And for that we need to keep
* track of the delalloc reservations separately, as they are
* subtracted from m_frextents, but not included in sb_frextents.
*/
percpu_counter_set(&mp->m_frextents,
fsc->frextents - fsc->frextents_delayed);
- mp->m_sb.sb_frextents = fsc->frextents;
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
return 0;
}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index b712a8bd34f5..ce86bdad37fa 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -12,6 +12,7 @@
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/health.h"
#include "scrub/common.h"
@@ -71,9 +72,9 @@
enum xchk_health_group {
XHG_FS = 1,
- XHG_RT,
XHG_AG,
XHG_INO,
+ XHG_RTGROUP,
};
struct xchk_health_map {
@@ -100,8 +101,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR },
[XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK },
[XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT },
- [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP },
- [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RTGROUP, XFS_SICK_RG_BITMAP },
+ [XFS_SCRUB_TYPE_RTSUM] = { XHG_RTGROUP, XFS_SICK_RG_SUMMARY },
[XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA },
[XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
[XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
@@ -109,6 +110,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK },
[XFS_SCRUB_TYPE_NLINKS] = { XHG_FS, XFS_SICK_FS_NLINKS },
[XFS_SCRUB_TYPE_DIRTREE] = { XHG_INO, XFS_SICK_INO_DIRTREE },
+ [XFS_SCRUB_TYPE_METAPATH] = { XHG_FS, XFS_SICK_FS_METAPATH },
+ [XFS_SCRUB_TYPE_RGSUPER] = { XHG_RTGROUP, XFS_SICK_RG_SUPER },
};
/* Return the health status mask for this scrub type. */
@@ -160,13 +163,14 @@ STATIC void
xchk_mark_all_healthy(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
- xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
- for_each_perag(mp, agno, pag)
- xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+ while ((pag = xfs_perag_next(mp, pag)))
+ xfs_group_mark_healthy(pag_group(pag), XFS_SICK_AG_INDIRECT);
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_group_mark_healthy(rtg_group(rtg), XFS_SICK_RG_INDIRECT);
}
/*
@@ -184,6 +188,7 @@ xchk_update_health(
struct xfs_scrub *sc)
{
struct xfs_perag *pag;
+ struct xfs_rtgroup *rtg;
bool bad;
/*
@@ -207,9 +212,9 @@ xchk_update_health(
case XHG_AG:
pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_ag_mark_corrupt(pag, sc->sick_mask);
+ xfs_group_mark_corrupt(pag_group(pag), sc->sick_mask);
else
- xfs_ag_mark_healthy(pag, sc->sick_mask);
+ xfs_group_mark_healthy(pag_group(pag), sc->sick_mask);
xfs_perag_put(pag);
break;
case XHG_INO:
@@ -236,11 +241,13 @@ xchk_update_health(
else
xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
break;
- case XHG_RT:
+ case XHG_RTGROUP:
+ rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
if (bad)
- xfs_rt_mark_corrupt(sc->mp, sc->sick_mask);
+ xfs_group_mark_corrupt(rtg_group(rtg), sc->sick_mask);
else
- xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+ xfs_group_mark_healthy(rtg_group(rtg), sc->sick_mask);
+ xfs_rtgroup_put(rtg);
break;
default:
ASSERT(0);
@@ -277,7 +284,7 @@ xchk_ag_btree_del_cursor_if_sick(
type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
mask &= ~sc->sick_mask;
- if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) {
+ if (xfs_group_has_sickness((*curp)->bc_group, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
*curp = NULL;
@@ -294,9 +301,8 @@ xchk_health_record(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
-
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
unsigned int sick;
unsigned int checked;
@@ -304,15 +310,17 @@ xchk_health_record(
if (sick & XFS_SICK_FS_PRIMARY)
xchk_set_corrupt(sc);
- xfs_rt_measure_sickness(mp, &sick, &checked);
- if (sick & XFS_SICK_RT_PRIMARY)
- xchk_set_corrupt(sc);
-
- for_each_perag(mp, agno, pag) {
- xfs_ag_measure_sickness(pag, &sick, &checked);
+ while ((pag = xfs_perag_next(mp, pag))) {
+ xfs_group_measure_sickness(pag_group(pag), &sick, &checked);
if (sick & XFS_SICK_AG_PRIMARY)
xchk_set_corrupt(sc);
}
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked);
+ if (sick & XFS_SICK_RG_PRIMARY)
+ xchk_set_corrupt(sc);
+ }
+
return 0;
}
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 750d7b0cd25a..abad54c3621d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -258,7 +258,7 @@ xchk_iallocbt_chunk(
{
struct xfs_scrub *sc = bs->sc;
struct xfs_mount *mp = bs->cur->bc_mp;
- struct xfs_perag *pag = bs->cur->bc_ag.pag;
+ struct xfs_perag *pag = to_perag(bs->cur->bc_group);
xfs_agblock_t agbno;
xfs_extlen_t len;
@@ -303,7 +303,6 @@ xchk_iallocbt_check_cluster_ifree(
unsigned int irec_ino,
struct xfs_dinode *dip)
{
- struct xfs_mount *mp = bs->cur->bc_mp;
xfs_ino_t fsino;
xfs_agino_t agino;
bool irec_free;
@@ -319,7 +318,7 @@ xchk_iallocbt_check_cluster_ifree(
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino);
+ fsino = xfs_agino_to_ino(to_perag(bs->cur->bc_group), agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -368,7 +367,6 @@ xchk_iallocbt_check_cluster(
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -396,7 +394,7 @@ xchk_iallocbt_check_cluster(
* ir_startino can be large enough to make im_boffset nonzero.
*/
ir_holemask = (irec->ir_holemask & cluster_mask);
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_blkno = xfs_agbno_to_daddr(to_perag(bs->cur->bc_group), agbno);
imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
mp->m_sb.sb_inodelog;
@@ -407,9 +405,9 @@ xchk_iallocbt_check_cluster(
return 0;
}
- trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
- imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
- cluster_mask, ir_holemask,
+ trace_xchk_iallocbt_check_cluster(to_perag(bs->cur->bc_group),
+ irec->ir_startino, imap.im_blkno, imap.im_len,
+ cluster_base, nr_inodes, cluster_mask, ir_holemask,
XFS_INO_TO_OFFSET(mp, irec->ir_startino +
cluster_base));
@@ -585,7 +583,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_inobt_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
index c8d2196a04e1..14e48d3f1912 100644
--- a/fs/xfs/scrub/ialloc_repair.c
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -146,15 +146,12 @@ xrep_ibt_check_ifree(
struct xfs_scrub *sc = ri->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_dinode *dip;
- xfs_ino_t fsino;
xfs_agino_t agino;
- xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
unsigned int cluster_buf_base;
unsigned int offset;
int error;
agino = cluster_ag_base + cluster_index;
- fsino = XFS_AGINO_TO_INO(mp, agno, agino);
/* Inode uncached or half assembled, read disk buffer */
cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
@@ -165,7 +162,8 @@ xrep_ibt_check_ifree(
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
return -EFSCORRUPTED;
- if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+ if (dip->di_version >= 3 &&
+ be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino))
return -EFSCORRUPTED;
/* Will the in-core inode tell us if it's in use? */
@@ -194,7 +192,7 @@ xrep_ibt_stash(
if (ri->rie.ir_freecount > 0)
ri->finobt_recs++;
- trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
+ trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);
error = xfarray_append(ri->inode_records, &ri->rie);
if (error)
@@ -307,7 +305,7 @@ xrep_ibt_process_cluster(
* inobt because imap_to_bp directly maps the buffer without touching
* either inode btree.
*/
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
+ imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno);
imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
imap.im_boffset = 0;
error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
@@ -423,9 +421,7 @@ xrep_ibt_record_inode_blocks(
if (error)
return error;
- trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
- rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
- rec->rm_offset, rec->rm_flags);
+ trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);
/*
* Record the free/hole masks for each inode cluster that could be
@@ -634,7 +630,6 @@ xrep_ibt_build_new_trees(
struct xfs_scrub *sc = ri->sc;
struct xfs_btree_cur *ino_cur;
struct xfs_btree_cur *fino_cur = NULL;
- xfs_fsblock_t fsbno;
bool need_finobt;
int error;
@@ -656,9 +651,8 @@ xrep_ibt_build_new_trees(
*
* Start by setting up the inobt staging cursor.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_IBT_BLOCK(sc->mp));
- xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
+ xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
XFS_AG_RESV_NONE);
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
@@ -677,10 +671,9 @@ xrep_ibt_build_new_trees(
if (sc->mp->m_finobt_nores)
resv = XFS_AG_RESV_NONE;
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_FIBT_BLOCK(sc->mp));
xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
- fsbno, resv);
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)),
+ resv);
ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
ri->new_finobt.bload.get_records = xrep_fibt_get_records;
@@ -821,7 +814,7 @@ xrep_iallocbt(
sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
/* Set up enough storage to handle an AG with nothing but inodes. */
- xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
+ xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
last_agino /= XFS_INODES_PER_CHUNK;
descr = xchk_xfile_ag_descr(sc, "inode index records");
error = xfarray_create(descr, last_agino,
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index d32716fb2fec..25ee66e7649d 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -60,6 +60,22 @@ xchk_install_handle_iscrub(
if (error)
return error;
+ /*
+ * Don't allow scrubbing by handle of any non-directory inode records
+ * in the metadata directory tree. We don't know if any of the scans
+ * launched by this scrubber will end up indirectly trying to lock this
+ * file.
+ *
+ * Scrubbers of inode-rooted metadata files (e.g. quota files) will
+ * attach all the resources needed to scrub the inode and call
+ * xchk_inode directly. Userspace cannot call this directly.
+ */
+ if (xfs_is_metadir_inode(ip) && !S_ISDIR(VFS_I(ip)->i_mode)) {
+ xchk_irele(sc, ip);
+ sc->ip = NULL;
+ return -ENOENT;
+ }
+
return xchk_prepare_iscrub(sc);
}
@@ -94,9 +110,15 @@ xchk_setup_inode(
return xchk_prepare_iscrub(sc);
}
- /* Reject internal metadata files and obviously bad inode numbers. */
- if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ /*
+ * On pre-metadir filesystems, reject internal metadata files. For
+ * metadir filesystems, limited scrubbing of any file in the metadata
+ * directory tree by handle is allowed, because that is the only way to
+ * validate the lack of parent pointers in the sb-root metadata inodes.
+ */
+ if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
return -ENOENT;
+ /* Reject obviously bad inode numbers. */
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
@@ -421,8 +443,13 @@ xchk_dinode(
break;
case 2:
case 3:
- if (dip->di_onlink != 0)
- xchk_ino_set_corrupt(sc, ino);
+ if (xfs_dinode_is_metadir(dip)) {
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ xchk_ino_set_corrupt(sc, ino);
+ } else {
+ if (dip->di_metatype != 0)
+ xchk_ino_set_corrupt(sc, ino);
+ }
if (dip->di_mode == 0 && sc->ip)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 3e45b9b72312..5a58ddd27bd2 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -521,10 +521,17 @@ STATIC void
xrep_dinode_nlinks(
struct xfs_dinode *dip)
{
- if (dip->di_version > 1)
- dip->di_onlink = 0;
- else
+ if (dip->di_version < 2) {
dip->di_nlink = 0;
+ return;
+ }
+
+ if (xfs_dinode_is_metadir(dip)) {
+ if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX)
+ dip->di_metatype = cpu_to_be16(XFS_METAFILE_UNKNOWN);
+ } else {
+ dip->di_metatype = 0;
+ }
}
/* Fix any conflicting flags that the verifiers complain about. */
@@ -565,6 +572,16 @@ xrep_dinode_flags(
dip->di_nrext64_pad = 0;
else if (dip->di_version >= 3)
dip->di_v3_pad = 0;
+
+ if (flags2 & XFS_DIFLAG2_METADATA) {
+ xfs_failaddr_t fa;
+
+ fa = xfs_dinode_verify_metadir(sc->mp, dip, mode, flags,
+ flags2);
+ if (fa)
+ flags2 &= ~XFS_DIFLAG2_METADATA;
+ }
+
dip->di_flags = cpu_to_be16(flags);
dip->di_flags2 = cpu_to_be64(flags2);
}
@@ -761,14 +778,13 @@ STATIC int
xrep_dinode_count_rmaps(
struct xrep_inode *ri)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error;
if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
return -EOPNOTSUPP;
- for_each_perag(ri->sc->mp, agno, pag) {
+ while ((pag = xfs_perag_next(ri->sc->mp, pag))) {
error = xrep_dinode_count_ag_rmaps(ri, pag);
if (error) {
xfs_perag_rele(pag);
@@ -1755,15 +1771,8 @@ xrep_inode_pptr(
if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
return 0;
- /* The root directory doesn't have a parent pointer. */
- if (ip == mp->m_rootip)
- return 0;
-
- /*
- * Metadata inodes are rooted in the superblock and do not have any
- * parents.
- */
- if (xfs_is_metadata_inode(ip))
+ /* Children of the superblock do not have parent pointers. */
+ if (xchk_inode_is_sb_rooted(ip))
return 0;
/* Inode already has an attr fork; no further work possible here. */
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index cf9d983667ce..84f117667ca2 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -67,7 +67,7 @@ xchk_iscan_mask_skipino(
xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
xfs_agnumber_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
- if (pag->pag_agno != skip_agno)
+ if (pag_agno(pag) != skip_agno)
return;
if (skip_agino < rec->ir_startino)
return;
@@ -95,7 +95,7 @@ xchk_iscan_find_next(
struct xfs_btree_cur *cur;
struct xfs_mount *mp = sc->mp;
struct xfs_trans *tp = sc->tp;
- xfs_agnumber_t agno = pag->pag_agno;
+ xfs_agnumber_t agno = pag_agno(pag);
xfs_agino_t lastino = NULLAGINO;
xfs_agino_t first, last;
xfs_agino_t agino = *cursor;
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
new file mode 100644
index 000000000000..b78db6513465
--- /dev/null
+++ b/fs/xfs/scrub/metapath.c
@@ -0,0 +1,689 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_metafile.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dir2.h"
+#include "xfs_parent.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_attr.h"
+#include "xfs_rtgroup.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+#include "scrub/repair.h"
+
+/*
+ * Metadata Directory Tree Paths
+ * =============================
+ *
+ * A filesystem with metadir enabled expects to find metadata structures
+ * attached to files that are accessible by walking a path down the metadata
+ * directory tree. Given the metadir path and the incore inode storing the
+ * metadata, this scrubber ensures that the ondisk metadir path points to the
+ * ondisk inode represented by the incore inode.
+ */
+
+struct xchk_metapath {
+ struct xfs_scrub *sc;
+
+ /* Name for lookup */
+ struct xfs_name xname;
+
+ /* Directory update for repairs */
+ struct xfs_dir_update du;
+
+ /* Path down to this metadata file from the parent directory */
+ const char *path;
+
+ /* Directory parent of the metadata file. */
+ struct xfs_inode *dp;
+
+ /* Locks held on dp */
+ unsigned int dp_ilock_flags;
+
+ /* Transaction block reservations */
+ unsigned int link_resblks;
+ unsigned int unlink_resblks;
+
+ /* Parent pointer updates */
+ struct xfs_parent_args link_ppargs;
+ struct xfs_parent_args unlink_ppargs;
+
+ /* Scratchpads for removing links */
+ struct xfs_da_args pptr_args;
+};
+
+/* Release resources tracked in the buffer. */
+static inline void
+xchk_metapath_cleanup(
+ void *buf)
+{
+ struct xchk_metapath *mpath = buf;
+
+ if (mpath->dp_ilock_flags)
+ xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
+ kfree(mpath->path);
+}
+
+/* Set up a metadir path scan. @path must be dynamically allocated. */
+static inline int
+xchk_setup_metapath_scan(
+ struct xfs_scrub *sc,
+ struct xfs_inode *dp,
+ const char *path,
+ struct xfs_inode *ip)
+{
+ struct xchk_metapath *mpath;
+ int error;
+
+ if (!path)
+ return -ENOMEM;
+
+ error = xchk_install_live_inode(sc, ip);
+ if (error) {
+ kfree(path);
+ return error;
+ }
+
+ mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
+ if (!mpath) {
+ kfree(path);
+ return -ENOMEM;
+ }
+
+ mpath->sc = sc;
+ sc->buf = mpath;
+ sc->buf_cleanup = xchk_metapath_cleanup;
+
+ mpath->dp = dp;
+ mpath->path = path; /* path is now owned by mpath */
+
+ mpath->xname.name = mpath->path;
+ mpath->xname.len = strlen(mpath->path);
+ mpath->xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+
+ return 0;
+}
+
+#ifdef CONFIG_XFS_RT
+/* Scan the /rtgroups directory itself. */
+static int
+xchk_setup_metapath_rtdir(
+ struct xfs_scrub *sc)
+{
+ if (!sc->mp->m_rtdirip)
+ return -ENOENT;
+
+ return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
+ kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+}
+
+/* Scan a rtgroup inode under the /rtgroups directory. */
+static int
+xchk_setup_metapath_rtginode(
+ struct xfs_scrub *sc,
+ enum xfs_rtg_inodes type)
+{
+ struct xfs_rtgroup *rtg;
+ struct xfs_inode *ip;
+ int error;
+
+ rtg = xfs_rtgroup_get(sc->mp, sc->sm->sm_agno);
+ if (!rtg)
+ return -ENOENT;
+
+ ip = rtg->rtg_inodes[type];
+ if (!ip) {
+ error = -ENOENT;
+ goto out_put_rtg;
+ }
+
+ error = xchk_setup_metapath_scan(sc, sc->mp->m_rtdirip,
+ xfs_rtginode_path(rtg_rgno(rtg), type), ip);
+
+out_put_rtg:
+ xfs_rtgroup_put(rtg);
+ return error;
+}
+#else
+# define xchk_setup_metapath_rtdir(...) (-ENOENT)
+# define xchk_setup_metapath_rtginode(...) (-ENOENT)
+#endif /* CONFIG_XFS_RT */
+
+#ifdef CONFIG_XFS_QUOTA
+/* Scan the /quota directory itself. */
+static int
+xchk_setup_metapath_quotadir(
+ struct xfs_scrub *sc)
+{
+ struct xfs_trans *tp;
+ struct xfs_inode *dp = NULL;
+ int error;
+
+ error = xfs_trans_alloc_empty(sc->mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_dqinode_load_parent(tp, &dp);
+ xfs_trans_cancel(tp);
+ if (error)
+ return error;
+
+ error = xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
+ kasprintf(GFP_KERNEL, "quota"), dp);
+ xfs_irele(dp);
+ return error;
+}
+
+/* Scan a quota inode under the /quota directory. */
+static int
+xchk_setup_metapath_dqinode(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t type)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_inode *dp = NULL;
+ struct xfs_inode *ip = NULL;
+ const char *path;
+ int error;
+
+ error = xfs_trans_alloc_empty(sc->mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_dqinode_load_parent(tp, &dp);
+ if (error)
+ goto out_cancel;
+
+ error = xfs_dqinode_load(tp, dp, type, &ip);
+ if (error)
+ goto out_dp;
+
+ xfs_trans_cancel(tp);
+ tp = NULL;
+
+ path = kasprintf(GFP_KERNEL, "%s", xfs_dqinode_path(type));
+ error = xchk_setup_metapath_scan(sc, dp, path, ip);
+
+ xfs_irele(ip);
+out_dp:
+ xfs_irele(dp);
+out_cancel:
+ if (tp)
+ xfs_trans_cancel(tp);
+ return error;
+}
+#else
+# define xchk_setup_metapath_quotadir(...) (-ENOENT)
+# define xchk_setup_metapath_dqinode(...) (-ENOENT)
+#endif /* CONFIG_XFS_QUOTA */
+
+int
+xchk_setup_metapath(
+ struct xfs_scrub *sc)
+{
+ if (!xfs_has_metadir(sc->mp))
+ return -ENOENT;
+ if (sc->sm->sm_gen)
+ return -EINVAL;
+
+ switch (sc->sm->sm_ino) {
+ case XFS_SCRUB_METAPATH_PROBE:
+ /* Just probing, nothing else to do. */
+ if (sc->sm->sm_agno)
+ return -EINVAL;
+ return 0;
+ case XFS_SCRUB_METAPATH_RTDIR:
+ return xchk_setup_metapath_rtdir(sc);
+ case XFS_SCRUB_METAPATH_RTBITMAP:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_BITMAP);
+ case XFS_SCRUB_METAPATH_RTSUMMARY:
+ return xchk_setup_metapath_rtginode(sc, XFS_RTGI_SUMMARY);
+ case XFS_SCRUB_METAPATH_QUOTADIR:
+ return xchk_setup_metapath_quotadir(sc);
+ case XFS_SCRUB_METAPATH_USRQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_USER);
+ case XFS_SCRUB_METAPATH_GRPQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_GROUP);
+ case XFS_SCRUB_METAPATH_PRJQUOTA:
+ return xchk_setup_metapath_dqinode(sc, XFS_DQTYPE_PROJ);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and child. We do not know
+ * that the metadata directory is not corrupt, so we lock the parent and try
+ * to lock the child. Returns 0 if successful, or -EINTR to abort the scrub.
+ */
+STATIC int
+xchk_metapath_ilock_both(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ int error = 0;
+
+ while (true) {
+ xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+ if (xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) {
+ mpath->dp_ilock_flags |= XFS_ILOCK_EXCL;
+ return 0;
+ }
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
+ }
+
+ ASSERT(0);
+ return -EINTR;
+}
+
+/* Unlock parent and child inodes. */
+static inline void
+xchk_metapath_iunlock(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+
+ mpath->dp_ilock_flags &= ~XFS_ILOCK_EXCL;
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+}
+
+int
+xchk_metapath(
+ struct xfs_scrub *sc)
+{
+ struct xchk_metapath *mpath = sc->buf;
+ xfs_ino_t ino = NULLFSINO;
+ int error;
+
+ /* Just probing, nothing else to do. */
+ if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE)
+ return 0;
+
+ /* Parent required to do anything else. */
+ if (mpath->dp == NULL) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_trans_alloc_empty(sc);
+ if (error)
+ return error;
+
+ error = xchk_metapath_ilock_both(mpath);
+ if (error)
+ goto out_cancel;
+
+ /* Make sure the parent dir has a dirent pointing to this file. */
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xchk_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /* No directory entry at all */
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ error = 0;
+ goto out_ilock;
+ }
+ if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
+ goto out_ilock;
+ if (ino != sc->ip->i_ino) {
+ /* Pointing to wrong inode */
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+out_ilock:
+ xchk_metapath_iunlock(mpath);
+out_cancel:
+ xchk_trans_cancel(sc);
+ return error;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Create the dirent represented by the final component of the path. */
+STATIC int
+xrep_metapath_link(
+ struct xchk_metapath *mpath)
+{
+ struct xfs_scrub *sc = mpath->sc;
+
+ mpath->du.dp = mpath->dp;
+ mpath->du.name = &mpath->xname;
+ mpath->du.ip = sc->ip;
+
+ if (xfs_has_parent(sc->mp))
+ mpath->du.ppargs = &mpath->link_ppargs;
+ else
+ mpath->du.ppargs = NULL;
+
+ trace_xrep_metapath_link(sc, mpath->path, mpath->dp, sc->ip->i_ino);
+
+ return xfs_dir_add_child(sc->tp, mpath->link_resblks, &mpath->du);
+}
+
+/* Remove the dirent at the final component of the path. */
+STATIC int
+xrep_metapath_unlink(
+ struct xchk_metapath *mpath,
+ xfs_ino_t ino,
+ struct xfs_inode *ip)
+{
+ struct xfs_parent_rec rec;
+ struct xfs_scrub *sc = mpath->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ trace_xrep_metapath_unlink(sc, mpath->path, mpath->dp, ino);
+
+ if (!ip) {
+ /* The child inode isn't allocated. Junk the dirent. */
+ xfs_trans_log_inode(sc->tp, mpath->dp, XFS_ILOG_CORE);
+ return xfs_dir_removename(sc->tp, mpath->dp, &mpath->xname,
+ ino, mpath->unlink_resblks);
+ }
+
+ mpath->du.dp = mpath->dp;
+ mpath->du.name = &mpath->xname;
+ mpath->du.ip = ip;
+ mpath->du.ppargs = NULL;
+
+ /* Figure out if we're removing a parent pointer too. */
+ if (xfs_has_parent(mp)) {
+ xfs_inode_to_parent_rec(&rec, ip);
+ error = xfs_parent_lookup(sc->tp, ip, &mpath->xname, &rec,
+ &mpath->pptr_args);
+ switch (error) {
+ case -ENOATTR:
+ break;
+ case 0:
+ mpath->du.ppargs = &mpath->unlink_ppargs;
+ break;
+ default:
+ return error;
+ }
+ }
+
+ return xfs_dir_remove_child(sc->tp, mpath->unlink_resblks, &mpath->du);
+}
+
+/*
+ * Try to create a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @sc->ip. Returns:
+ *
+ * -EEXIST and an @alleged_child if the dirent points to the wrong inode;
+ * 0 if there is now a dirent pointing to @sc->ip; or
+ * A negative errno on error.
+ */
+STATIC int
+xrep_metapath_try_link(
+ struct xchk_metapath *mpath,
+ xfs_ino_t *alleged_child)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ xfs_ino_t ino;
+ int error;
+
+ /* Allocate transaction, lock inodes, join to transaction. */
+ error = xchk_trans_alloc(sc, mpath->link_resblks);
+ if (error)
+ return error;
+
+ error = xchk_metapath_ilock_both(mpath);
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+ xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /*
+ * There is no dirent in the directory. Create an entry
+ * pointing to @sc->ip.
+ */
+ error = xrep_metapath_link(mpath);
+ if (error)
+ goto out_cancel;
+
+ error = xrep_trans_commit(sc);
+ xchk_metapath_iunlock(mpath);
+ return error;
+ }
+ if (error)
+ goto out_cancel;
+
+ if (ino == sc->ip->i_ino) {
+ /* The dirent already points to @sc->ip; we're done. */
+ error = 0;
+ goto out_cancel;
+ }
+
+ /*
+ * The dirent points elsewhere; pass that back so that the caller
+ * can try to remove the dirent.
+ */
+ *alleged_child = ino;
+ error = -EEXIST;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+ xchk_metapath_iunlock(mpath);
+ return error;
+}
+
+/*
+ * Take the ILOCK on the metadata directory parent and a bad child, if one is
+ * supplied. We do not know that the metadata directory is not corrupt, so we
+ * lock the parent and try to lock the child. Returns 0 if successful, or
+ * -EINTR to abort the repair. The lock state of @dp is not recorded in @mpath.
+ */
+STATIC int
+xchk_metapath_ilock_parent_and_child(
+ struct xchk_metapath *mpath,
+ struct xfs_inode *ip)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ int error = 0;
+
+ while (true) {
+ xfs_ilock(mpath->dp, XFS_ILOCK_EXCL);
+ if (!ip || xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return 0;
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(1);
+ }
+
+ ASSERT(0);
+ return -EINTR;
+}
+
+/*
+ * Try to remove a dirent in @mpath->dp with the name @mpath->xname that points
+ * to @alleged_child. Returns:
+ *
+ * 0 if there is no longer a dirent;
+ * -EEXIST if the dirent points to @sc->ip;
+ * -EAGAIN and an updated @alleged_child if the dirent points elsewhere; or
+ * A negative errno for any other error.
+ */
+STATIC int
+xrep_metapath_try_unlink(
+ struct xchk_metapath *mpath,
+ xfs_ino_t *alleged_child)
+{
+ struct xfs_scrub *sc = mpath->sc;
+ struct xfs_inode *ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(*alleged_child != sc->ip->i_ino);
+
+ trace_xrep_metapath_try_unlink(sc, mpath->path, mpath->dp,
+ *alleged_child);
+
+ /*
+ * Allocate transaction, grab the alleged child inode, lock inodes,
+ * join to transaction.
+ */
+ error = xchk_trans_alloc(sc, mpath->unlink_resblks);
+ if (error)
+ return error;
+
+ error = xchk_iget(sc, *alleged_child, &ip);
+ if (error == -EINVAL || error == -ENOENT) {
+ /* inode number is bogus, junk the dirent */
+ error = 0;
+ }
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+
+ error = xchk_metapath_ilock_parent_and_child(mpath, ip);
+ if (error) {
+ xchk_trans_cancel(sc);
+ return error;
+ }
+ xfs_trans_ijoin(sc->tp, mpath->dp, 0);
+ if (ip)
+ xfs_trans_ijoin(sc->tp, ip, 0);
+
+ error = xchk_dir_lookup(sc, mpath->dp, &mpath->xname, &ino);
+ trace_xrep_metapath_lookup(sc, mpath->path, mpath->dp, ino);
+ if (error == -ENOENT) {
+ /*
+ * There is no dirent in the directory anymore. We're ready to
+ * try the link operation again.
+ */
+ error = 0;
+ goto out_cancel;
+ }
+ if (error)
+ goto out_cancel;
+
+ if (ino == sc->ip->i_ino) {
+ /* The dirent already points to @sc->ip; we're done. */
+ error = -EEXIST;
+ goto out_cancel;
+ }
+
+ /*
+ * The dirent does not point to the alleged child. Update the caller
+ * and signal that we want to be called again.
+ */
+ if (ino != *alleged_child) {
+ *alleged_child = ino;
+ error = -EAGAIN;
+ goto out_cancel;
+ }
+
+ /* Remove the link to the child. */
+ error = xrep_metapath_unlink(mpath, ino, ip);
+ if (error)
+ goto out_cancel;
+
+ error = xrep_trans_commit(sc);
+ goto out_unlock;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_unlock:
+ xfs_iunlock(mpath->dp, XFS_ILOCK_EXCL);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xchk_irele(sc, ip);
+ }
+ return error;
+}
+
+/*
+ * Make sure the metadata directory path points to the child being examined.
+ *
+ * Repair needs to be able to create a directory structure, create its own
+ * transactions, and take ILOCKs. This function /must/ be called after all
+ * other repairs have completed.
+ */
+int
+xrep_metapath(
+ struct xfs_scrub *sc)
+{
+ struct xchk_metapath *mpath = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /* Just probing, nothing to repair. */
+ if (sc->sm->sm_ino == XFS_SCRUB_METAPATH_PROBE)
+ return 0;
+
+ /* Parent required to do anything else. */
+ if (mpath->dp == NULL)
+ return -EFSCORRUPTED;
+
+ /*
+ * Make sure the child file actually has an attr fork to receive a new
+ * parent pointer if the fs has parent pointers.
+ */
+ if (xfs_has_parent(mp)) {
+ error = xfs_attr_add_fork(sc->ip,
+ sizeof(struct xfs_attr_sf_hdr), 1);
+ if (error)
+ return error;
+ }
+
+ /* Compute block reservation required to unlink and link a file. */
+ mpath->unlink_resblks = xfs_remove_space_res(mp, MAXNAMELEN);
+ mpath->link_resblks = xfs_link_space_res(mp, MAXNAMELEN);
+
+ do {
+ xfs_ino_t alleged_child;
+
+ /* Re-establish the link, or tell us which inode to remove. */
+ error = xrep_metapath_try_link(mpath, &alleged_child);
+ if (!error)
+ return 0;
+ if (error != -EEXIST)
+ return error;
+
+ /*
+ * Remove an incorrect link to an alleged child, or tell us
+ * which inode to remove.
+ */
+ do {
+ error = xrep_metapath_try_unlink(mpath, &alleged_child);
+ } while (error == -EAGAIN);
+ if (error == -EEXIST) {
+ /* Link established; we're done. */
+ error = 0;
+ break;
+ }
+ } while (!error);
+
+ return error;
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
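
Note: xrep_metapath() above converges through a pair of nested retry loops:
try_link() either succeeds or reports the inode the dirent actually points
at, and try_unlink() keeps chasing that inode until the dirent is gone (or
turns out to point at the right child after all). A compilable userspace
model of just that control flow; the stub behaviors are assumptions for
illustration only:

    #include <errno.h>
    #include <stdio.h>

    static unsigned long dirent_ino = 77;      /* wrong target on disk */
    static const unsigned long want_ino = 42;  /* child being repaired */

    static int try_link(unsigned long *alleged)
    {
        if (dirent_ino == 0) {
            dirent_ino = want_ino;    /* created the dirent */
            return 0;
        }
        if (dirent_ino == want_ino)
            return 0;                 /* already correct */
        *alleged = dirent_ino;        /* report the squatter */
        return -EEXIST;
    }

    static int try_unlink(unsigned long *alleged)
    {
        if (dirent_ino == want_ino)
            return -EEXIST;           /* fixed underneath us */
        if (dirent_ino != *alleged) {
            *alleged = dirent_ino;    /* raced; chase the new target */
            return -EAGAIN;
        }
        dirent_ino = 0;               /* junk the bad dirent */
        return 0;
    }

    int main(void)
    {
        unsigned long alleged = 0;
        int error;

        do {
            error = try_link(&alleged);
            if (!error)
                break;
            if (error != -EEXIST)
                return 1;
            do {
                error = try_unlink(&alleged);
            } while (error == -EAGAIN);
            if (error == -EEXIST) {
                error = 0;            /* link established */
                break;
            }
        } while (!error);
        printf("dirent now points at %lu (error %d)\n", dirent_ino, error);
        return 0;
    }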
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 2aa14b7ab630..70af27d98734 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -58,7 +58,7 @@ xrep_newbt_estimate_slack(
if (sc->ops->type == ST_PERAG) {
free = sc->sa.pag->pagf_freeblks;
- sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
free = percpu_counter_sum(&sc->mp->m_fdblocks);
sz = sc->mp->m_sb.sb_dblocks;
@@ -186,11 +186,10 @@ xrep_newbt_add_extent(
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- struct xfs_mount *mp = xnr->sc->mp;
struct xfs_alloc_arg args = {
.tp = NULL, /* no autoreap */
.oinfo = xnr->oinfo,
- .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
+ .fsbno = xfs_agbno_to_fsb(pag, agbno),
.len = len,
.resv = xnr->resv,
};
@@ -206,12 +205,12 @@ xrep_newbt_validate_ag_alloc_hint(
struct xfs_scrub *sc = xnr->sc;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
- if (agno == sc->sa.pag->pag_agno &&
+ if (agno == pag_agno(sc->sa.pag) &&
xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
return;
- xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
- XFS_AGFL_BLOCK(sc->mp) + 1);
+ xnr->alloc_hint =
+ xfs_agbno_to_fsb(sc->sa.pag, XFS_AGFL_BLOCK(sc->mp) + 1);
}
/* Allocate disk space for a new per-AG btree. */
@@ -251,16 +250,15 @@ xrep_newbt_alloc_ag_blocks(
return -ENOSPC;
agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+ if (agno != pag_agno(sc->sa.pag)) {
+ ASSERT(agno == pag_agno(sc->sa.pag));
+ return -EFSCORRUPTED;
+ }
- trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag,
XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
xnr->oinfo.oi_owner);
- if (agno != sc->sa.pag->pag_agno) {
- ASSERT(agno == sc->sa.pag->pag_agno);
- return -EFSCORRUPTED;
- }
-
error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
if (error)
return error;
@@ -326,16 +324,16 @@ xrep_newbt_alloc_file_blocks(
agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
- trace_xrep_newbt_alloc_file_blocks(mp, agno,
- XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
- xnr->oinfo.oi_owner);
-
pag = xfs_perag_get(mp, agno);
if (!pag) {
ASSERT(0);
return -EFSCORRUPTED;
}
+ trace_xrep_newbt_alloc_file_blocks(pag,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
error = xrep_newbt_add_blocks(xnr, pag, &args);
xfs_perag_put(pag);
if (error)
@@ -376,7 +374,6 @@ xrep_newbt_free_extent(
struct xfs_scrub *sc = xnr->sc;
xfs_agblock_t free_agbno = resv->agbno;
xfs_extlen_t free_aglen = resv->len;
- xfs_fsblock_t fsbno;
int error;
if (!btree_committed || resv->used == 0) {
@@ -385,8 +382,8 @@ xrep_newbt_free_extent(
* space reservation, let the existing EFI free the entire
* space extent.
*/
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
- free_agbno, free_aglen, xnr->oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ xnr->oinfo.oi_owner);
xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
return 1;
}
@@ -403,8 +400,8 @@ xrep_newbt_free_extent(
if (free_aglen == 0)
return 0;
- trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
- free_aglen, xnr->oinfo.oi_owner);
+ trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen,
+ xnr->oinfo.oi_owner);
ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
@@ -413,9 +410,9 @@ xrep_newbt_free_extent(
* Use EFIs to free the reservations. This reduces the chance
* that we leak blocks if the system goes down.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
- error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
- xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
+ error = xfs_free_extent_later(sc->tp,
+ xfs_agbno_to_fsb(resv->pag, free_agbno), free_aglen,
+ &xnr->oinfo, xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
if (error)
return error;
@@ -516,7 +513,6 @@ xrep_newbt_claim_block(
union xfs_btree_ptr *ptr)
{
struct xrep_newbt_resv *resv;
- struct xfs_mount *mp = cur->bc_mp;
xfs_agblock_t agbno;
/*
@@ -541,12 +537,10 @@ xrep_newbt_claim_block(
if (resv->used == resv->len)
list_move_tail(&resv->list, &xnr->resv_list);
- trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
- xnr->oinfo.oi_owner);
+ trace_xrep_newbt_claim_block(resv->pag, agbno, 1, xnr->oinfo.oi_owner);
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
- ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
- agbno));
+ ptr->l = cpu_to_be64(xfs_agbno_to_fsb(resv->pag, agbno));
else
ptr->s = cpu_to_be32(agbno);
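
Note: the hunks above replace open-coded XFS_AGB_TO_FSB(mp, agno, agbno)
conversions with a perag-based helper. A sketch of the packing it performs;
the shift width is a placeholder standing in for the superblock's
AG-blocks log:

    #include <stdint.h>
    #include <stdio.h>

    #define AGBLKLOG  16  /* hypothetical sb_agblklog */

    struct perag { uint32_t agno; };

    /* fsblock number = (AG number << agblklog) | AG block number */
    static uint64_t agbno_to_fsb(const struct perag *pag, uint32_t agbno)
    {
        return ((uint64_t)pag->agno << AGBLKLOG) | agbno;
    }

    int main(void)
    {
        struct perag pag = { .agno = 3 };

        printf("fsbno = 0x%llx\n",
               (unsigned long long)agbno_to_fsb(&pag, 0x12));
        return 0;
    }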
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 80aee30886c4..4a47d0aabf73 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -279,7 +279,7 @@ xchk_nlinks_collect_dirent(
* determine the backref count.
*/
if (dotdot) {
- if (dp == sc->mp->m_rootip)
+ if (xchk_inode_is_dirtree_root(dp))
error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
else if (!xfs_has_parent(sc->mp))
error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
@@ -735,7 +735,7 @@ xchk_nlinks_compare_inode(
}
}
- if (ip == sc->mp->m_rootip) {
+ if (xchk_inode_is_dirtree_root(ip)) {
/*
* For the root of a directory tree, both the '.' and '..'
* entries should point to the root directory. The dotdot
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index b3e707f47b7b..4ebdee095428 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -60,11 +60,9 @@ xrep_nlinks_is_orphaned(
unsigned int actual_nlink,
const struct xchk_nlink *obs)
{
- struct xfs_mount *mp = ip->i_mount;
-
if (obs->parents != 0)
return false;
- if (ip == mp->m_rootip || ip == sc->orphanage)
+ if (xchk_inode_is_dirtree_root(ip) || ip == sc->orphanage)
return false;
return actual_nlink != 0;
}
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 7148d8362db8..c287c755f2c5 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -295,7 +295,9 @@ xrep_orphanage_can_adopt(
return false;
if (sc->ip == sc->orphanage)
return false;
- if (xfs_internal_inum(sc->mp, sc->ip->i_ino))
+ if (xchk_inode_is_sb_rooted(sc->ip))
+ return false;
+ if (xfs_is_internal_inode(sc->ip))
return false;
return true;
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 91e7b51ce068..3b692c4acc1e 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -132,6 +132,14 @@ xchk_parent_validate(
return 0;
}
+ /* Is this the metadata root dir? Then '..' must point to itself. */
+ if (sc->ip == mp->m_metadirip) {
+ if (sc->ip->i_ino != mp->m_sb.sb_metadirino ||
+ sc->ip->i_ino != parent_ino)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
/* '..' must not point to ourselves. */
if (sc->ip->i_ino == parent_ino) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
@@ -185,6 +193,12 @@ xchk_parent_validate(
goto out_unlock;
}
+ /* Metadata and regular inodes cannot cross trees. */
+ if (xfs_is_metadir_inode(dp) != xfs_is_metadir_inode(sc->ip)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ goto out_unlock;
+ }
+
/* Look for a directory entry in the parent pointing to the child. */
error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -300,7 +314,7 @@ xchk_parent_pptr_and_dotdot(
}
/* Is this the root dir? Then '..' must point to itself. */
- if (sc->ip == sc->mp->m_rootip) {
+ if (xchk_inode_is_dirtree_root(sc->ip)) {
if (sc->ip->i_ino != pp->parent_ino)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
@@ -711,7 +725,7 @@ xchk_parent_count_pptrs(
}
if (S_ISDIR(VFS_I(sc->ip)->i_mode)) {
- if (sc->ip == sc->mp->m_rootip)
+ if (xchk_inode_is_dirtree_root(sc->ip))
pp->pptrs_found++;
if (VFS_I(sc->ip)->i_nlink == 0 && pp->pptrs_found > 0)
@@ -720,6 +734,14 @@ xchk_parent_count_pptrs(
pp->pptrs_found == 0)
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
} else {
+ /*
+ * Starting with metadir, we allow checking of parent pointers
+ * of non-directory files that are children of the superblock.
+ * Pretend that we found a parent pointer attr.
+ */
+ if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip))
+ pp->pptrs_found++;
+
if (VFS_I(sc->ip)->i_nlink != pp->pptrs_found)
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
@@ -885,10 +907,9 @@ bool
xchk_pptr_looks_zapped(
struct xfs_inode *ip)
{
- struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
- ASSERT(xfs_has_parent(mp));
+ ASSERT(xfs_has_parent(ip->i_mount));
/*
* Temporary files that cannot be linked into the directory tree do not
@@ -902,15 +923,15 @@ xchk_pptr_looks_zapped(
* of a parent pointer scan is always the empty set. It's safe to scan
* them even if the attr fork was zapped.
*/
- if (ip == mp->m_rootip)
+ if (xchk_inode_is_dirtree_root(ip))
return false;
/*
- * Metadata inodes are all rooted in the superblock and do not have
- * any parents. Hence the attr fork will not be initialized, but
- * there are no parent pointers that might have been zapped.
+ * Metadata inodes that are rooted in the superblock do not have any
+ * parents. Hence the attr fork will not be initialized, but there are
+ * no parent pointers that might have been zapped.
*/
- if (xfs_is_metadata_inode(ip))
+ if (xchk_inode_is_sb_rooted(ip))
return false;
/*
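
Note: the pptrs_found accounting above works because a superblock-rooted
metadata file has nlink == 1 but carries no parent pointer attrs, so the
scrubber credits it one phantom pointer before comparing. A tiny userspace
model of that comparison (illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    static bool nondir_pptrs_ok(unsigned int nlink, unsigned int pptrs_found,
                                bool sb_rooted)
    {
        if (sb_rooted)
            pptrs_found++;  /* pretend we found one */
        return nlink == pptrs_found;
    }

    int main(void)
    {
        printf("rtbitmap (nlink 1, 0 pptrs, sb-rooted): %s\n",
               nondir_pptrs_ok(1, 0, true) ? "ok" : "corrupt");
        printf("hardlinked file (nlink 2, 1 pptr): %s\n",
               nondir_pptrs_ok(2, 1, false) ? "ok" : "corrupt");
        return 0;
    }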
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 7b42b7f65a0b..31bfe10be22a 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -1334,7 +1334,7 @@ xrep_parent_rebuild_pptrs(
* so that we can decide if we're moving this file to the orphanage.
* For this purpose, root directories are their own parents.
*/
- if (sc->ip == sc->mp->m_rootip) {
+ if (xchk_inode_is_dirtree_root(sc->ip)) {
xrep_findparent_scan_found(&rp->pscan, sc->ip->i_ino);
} else {
error = xrep_parent_lookup_pptrs(sc, &parent_ino);
@@ -1354,21 +1354,40 @@ STATIC int
xrep_parent_rebuild_tree(
struct xrep_parent *rp)
{
+ struct xfs_scrub *sc = rp->sc;
+ bool try_adoption;
int error;
- if (xfs_has_parent(rp->sc->mp)) {
+ if (xfs_has_parent(sc->mp)) {
error = xrep_parent_rebuild_pptrs(rp);
if (error)
return error;
}
- if (rp->pscan.parent_ino == NULLFSINO) {
- if (xrep_orphanage_can_adopt(rp->sc))
+ /*
+ * Any file with no parent could be adopted. This check happens after
+ * rebuilding the parent pointer structure because we might have cycled
+ * the ILOCK during that process.
+ */
+ try_adoption = rp->pscan.parent_ino == NULLFSINO;
+
+ /*
+ * Starting with metadir, we allow checking of parent pointers
+ * of non-directory files that are children of the superblock.
+ * Lack of parent is ok here.
+ */
+ if (try_adoption && xfs_has_metadir(sc->mp) &&
+ xchk_inode_is_sb_rooted(sc->ip))
+ try_adoption = false;
+
+ if (try_adoption) {
+ if (xrep_orphanage_can_adopt(sc))
return xrep_parent_move_to_orphanage(rp);
return -EFSCORRUPTED;
}
- if (S_ISDIR(VFS_I(rp->sc->ip)->i_mode))
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
return xrep_parent_reset_dotdot(rp);
return 0;
@@ -1422,6 +1441,14 @@ xrep_parent_set_nondir_nlink(
if (error)
return error;
+ /*
+ * Starting with metadir, we allow checking of parent pointers of
+ * non-directory files that are children of the superblock. Pretend
+ * that we found a parent pointer attr.
+ */
+ if (xfs_has_metadir(sc->mp) && xchk_inode_is_sb_rooted(sc->ip))
+ rp->parents++;
+
if (rp->parents > 0 && xfs_inode_on_unlinked_list(ip)) {
xfs_trans_ijoin(sc->tp, sc->ip, 0);
joined = true;
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index c77eb2de8df7..dc4033b91e44 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -398,10 +398,13 @@ xqcheck_collect_inode(
bool isreg = S_ISREG(VFS_I(ip)->i_mode);
int error = 0;
- if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+ if (xfs_is_metadir_inode(ip) ||
+ xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
/*
* Quota files are never counted towards quota, so we do not
- * need to take the lock.
+ * need to take the lock. Files do not switch between the
+ * metadata and regular directory trees without a reallocation,
+ * so we do not need to ILOCK them either.
*/
xchk_iscan_mark_visited(&xqc->iscan, ip);
return 0;
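
Note: the quotacheck hunk above folds metadir files into the same fast path
as the quota inodes themselves: neither contributes to quota usage, and
because files never migrate between the metadata and regular directory
trees in place, no ILOCK is needed to decide. A one-function model of the
predicate (illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    /* Skip files that can never count toward quota usage. */
    static bool skip_for_quotacheck(bool is_metadir, bool is_quota_inode)
    {
        return is_metadir || is_quota_inode;
    }

    int main(void)
    {
        printf("rtbitmap: %s\n",
               skip_for_quotacheck(true, false) ? "skip" : "count");
        printf("user file: %s\n",
               skip_for_quotacheck(false, false) ? "skip" : "count");
        return 0;
    }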
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 53697f3c5e1b..08230952053b 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -137,7 +137,7 @@ xreap_put_freelist(
agfl_bp, agbno, 0);
if (error)
return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
return 0;
@@ -263,7 +263,6 @@ xreap_agextent_binval(
struct xfs_scrub *sc = rs->sc;
struct xfs_perag *pag = sc->sa.pag;
struct xfs_mount *mp = sc->mp;
- xfs_agnumber_t agno = sc->sa.pag->pag_agno;
xfs_agblock_t agbno_next = agbno + *aglenp;
xfs_agblock_t bno = agbno;
@@ -284,7 +283,7 @@ xreap_agextent_binval(
*/
while (bno < agbno_next) {
struct xrep_bufscan scan = {
- .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .daddr = xfs_agbno_to_daddr(pag, bno),
.max_sectors = xrep_bufscan_max_sectors(mp,
agbno_next - bno),
.daddr_step = XFS_FSB_TO_BB(mp, 1),
@@ -391,7 +390,7 @@ xreap_agextent_iter(
xfs_fsblock_t fsbno;
int error = 0;
- fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
+ fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);
/*
* If there are other rmappings, this block is cross linked and must
@@ -780,7 +779,6 @@ xreap_bmapi_binval(
xfs_fileoff_t off;
xfs_fileoff_t max_off;
xfs_extlen_t scan_blocks;
- xfs_agnumber_t agno = sc->sa.pag->pag_agno;
xfs_agblock_t bno;
xfs_agblock_t agbno;
xfs_agblock_t agbno_next;
@@ -837,7 +835,7 @@ xreap_bmapi_binval(
*/
while (bno < agbno_next) {
struct xrep_bufscan scan = {
- .daddr = XFS_AGB_TO_DADDR(mp, agno, bno),
+ .daddr = xfs_agbno_to_daddr(pag, bno),
.max_sectors = xrep_bufscan_max_sectors(mp,
scan_blocks),
.daddr_step = XFS_FSB_TO_BB(mp, 1),
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index d0c7d4a29c0f..2b6be75e9424 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -453,7 +453,8 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
- if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ if (xfs_refcount_check_irec(to_perag(bs->cur->bc_group), &irec) !=
+ NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index a00d7ce7ae5b..4e572b81c986 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -215,7 +215,7 @@ xrep_refc_rmap_shareable(
return false;
/* Metadata in files are never shareable */
- if (xfs_internal_inum(mp, rmap->rm_owner))
+ if (xfs_is_sb_inum(mp, rmap->rm_owner))
return false;
/* Metadata and unwritten file blocks are not shareable. */
@@ -590,7 +590,6 @@ xrep_refc_build_new_tree(
struct xfs_scrub *sc = rr->sc;
struct xfs_btree_cur *refc_cur;
struct xfs_perag *pag = sc->sa.pag;
- xfs_fsblock_t fsbno;
int error;
error = xrep_refc_sort_records(rr);
@@ -603,8 +602,8 @@ xrep_refc_build_new_tree(
* to root the new btree while it's under construction and before we
* attach it to the AG header.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
- xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
+ xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
XFS_AG_RESV_METADATA);
rr->new_btree.bload.get_records = xrep_refc_get_records;
rr->new_btree.bload.claim_block = xrep_refc_claim_block;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 155bbaaa496e..91c8bc055a4f 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
+#include "xfs_rtbitmap.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
@@ -305,7 +306,7 @@ xrep_calc_ag_resblks(
/* Now grab the block counters from the AGF. */
error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
if (error) {
- aglen = pag->block_count;
+ aglen = pag_group(pag)->xg_block_count;
freelen = aglen;
usedlen = aglen;
} else {
@@ -325,16 +326,14 @@ xrep_calc_ag_resblks(
/* If the block counts are impossible, make worst-case assumptions. */
if (aglen == NULLAGBLOCK ||
- aglen != pag->block_count ||
+ aglen != pag_group(pag)->xg_block_count ||
freelen >= aglen) {
- aglen = pag->block_count;
+ aglen = pag_group(pag)->xg_block_count;
freelen = aglen;
usedlen = aglen;
}
- xfs_perag_put(pag);
- trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
- freelen, usedlen);
+ trace_xrep_calc_ag_resblks(pag, icount, aglen, freelen, usedlen);
/*
* Figure out how many blocks we'd need worst case to rebuild
@@ -372,8 +371,9 @@ xrep_calc_ag_resblks(
rmapbt_sz = 0;
}
- trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
- inobt_sz, rmapbt_sz, refcbt_sz);
+ trace_xrep_calc_ag_resblks_btsize(pag, bnobt_sz, inobt_sz, rmapbt_sz,
+ refcbt_sz);
+ xfs_perag_put(pag);
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
@@ -414,7 +414,7 @@ xrep_fix_freelist(
args.mp = sc->mp;
args.tp = sc->tp;
- args.agno = sc->sa.pag->pag_agno;
+ args.agno = pag_agno(sc->sa.pag);
args.alignment = 1;
args.pag = sc->sa.pag;
@@ -483,7 +483,7 @@ xrep_findroot_block(
int block_level;
int error = 0;
- daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
+ daddr = xfs_agbno_to_daddr(ri->sc->sa.pag, agbno);
/*
* Blocks in the AGFL have stale contents that might just happen to
@@ -612,7 +612,7 @@ xrep_findroot_block(
else
fab->root = NULLAGBLOCK;
- trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
+ trace_xrep_findroot_block(ri->sc->sa.pag, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
xfs_trans_brelse(ri->sc->tp, bp);
@@ -953,6 +953,29 @@ xrep_ag_init(
return 0;
}
+#ifdef CONFIG_XFS_RT
+/*
+ * Given a reference to a rtgroup structure, lock rtgroup btree inodes and
+ * create btree cursors. Must only be called to repair a regular rt file.
+ */
+int
+xrep_rtgroup_init(
+ struct xfs_scrub *sc,
+ struct xfs_rtgroup *rtg,
+ struct xchk_rt *sr,
+ unsigned int rtglock_flags)
+{
+ ASSERT(sr->rtg == NULL);
+
+ xfs_rtgroup_lock(rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+
+ /* Grab our own passive reference from the caller's ref. */
+ sr->rtg = xfs_rtgroup_hold(rtg);
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
/* Reinitialize the per-AG block reservation for the AG we just fixed. */
int
xrep_reset_perag_resv(
@@ -973,7 +996,7 @@ xrep_reset_perag_resv(
if (error == -ENOSPC) {
xfs_err(sc->mp,
"Insufficient free space to reset per-AG reservation for AG %u after repair.",
- sc->sa.pag->pag_agno);
+ pag_agno(sc->sa.pag));
error = 0;
}
@@ -1083,7 +1106,12 @@ xrep_metadata_inode_forks(
if (error)
return error;
- /* Make sure the attr fork looks ok before we delete it. */
+ /*
+ * Metadata files can only have extended attributes on metadir
+ * filesystems, either for parent pointers or for actual xattr data.
+ * For a non-metadir filesystem, make sure the attr fork looks ok
+ * before we delete it.
+ */
if (xfs_inode_hasattr(sc->ip)) {
error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
if (error)
@@ -1099,8 +1127,11 @@ xrep_metadata_inode_forks(
return error;
}
- /* Clear the attr forks since metadata shouldn't have that. */
- if (xfs_inode_hasattr(sc->ip)) {
+ /*
+ * Metadata files on non-metadir filesystems cannot have attr forks,
+ * so clear them now.
+ */
+ if (xfs_inode_hasattr(sc->ip) && !xfs_has_metadir(sc->mp)) {
if (!dirty) {
dirty = true;
xfs_trans_ijoin(sc->tp, sc->ip, 0);
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 0e0dc2bf985c..b649da1a93eb 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -8,6 +8,7 @@
#include "xfs_quota_defs.h"
+struct xfs_rtgroup;
struct xchk_stats_run;
static inline int xrep_notsupported(struct xfs_scrub *sc)
@@ -106,6 +107,12 @@ int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
struct xchk_ag *sa);
+#ifdef CONFIG_XFS_RT
+int xrep_rtgroup_init(struct xfs_scrub *sc, struct xfs_rtgroup *rtg,
+ struct xchk_rt *sr, unsigned int rtglock_flags);
+#else
+# define xrep_rtgroup_init(sc, rtg, sr, lockflags) (-ENOSYS)
+#endif /* CONFIG_XFS_RT */
/* Metadata revalidators */
@@ -134,13 +141,16 @@ int xrep_directory(struct xfs_scrub *sc);
int xrep_parent(struct xfs_scrub *sc);
int xrep_symlink(struct xfs_scrub *sc);
int xrep_dirtree(struct xfs_scrub *sc);
+int xrep_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xrep_rtbitmap(struct xfs_scrub *sc);
int xrep_rtsummary(struct xfs_scrub *sc);
+int xrep_rgsuperblock(struct xfs_scrub *sc);
#else
# define xrep_rtbitmap xrep_notsupported
# define xrep_rtsummary xrep_notsupported
+# define xrep_rgsuperblock xrep_notsupported
#endif /* CONFIG_XFS_RT */
#ifdef CONFIG_XFS_QUOTA
@@ -208,6 +218,7 @@ xrep_setup_nothing(
#define xrep_setup_parent xrep_setup_nothing
#define xrep_setup_nlinks xrep_setup_nothing
#define xrep_setup_dirtree xrep_setup_nothing
+#define xrep_setup_metapath xrep_setup_nothing
#define xrep_setup_inode(sc, imap) ((void)0)
@@ -243,6 +254,8 @@ static inline int xrep_setup_symlink(struct xfs_scrub *sc, unsigned int *x)
#define xrep_parent xrep_notsupported
#define xrep_symlink xrep_notsupported
#define xrep_dirtree xrep_notsupported
+#define xrep_metapath xrep_notsupported
+#define xrep_rgsuperblock xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rgsuper.c b/fs/xfs/scrub/rgsuper.c
new file mode 100644
index 000000000000..463b3573bb76
--- /dev/null
+++ b/fs/xfs/scrub/rgsuper.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_rtgroup.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+
+/* Set us up with a transaction and an empty context. */
+int
+xchk_setup_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ return xchk_trans_alloc(sc, 0);
+}
+
+/* Cross-reference with the other rt metadata. */
+STATIC void
+xchk_rgsuperblock_xref(
+ struct xfs_scrub *sc)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return;
+
+ xchk_xref_is_used_rt_space(sc, xfs_rgbno_to_rtb(sc->sr.rtg, 0), 1);
+}
+
+int
+xchk_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ xfs_rgnumber_t rgno = sc->sm->sm_agno;
+ int error;
+
+ /*
+ * Only rtgroup 0 has a superblock. We may someday want to use higher
+ * rgno for other functions, similar to what we do with the primary
+ * super scrub function.
+ */
+ if (rgno != 0)
+ return -ENOENT;
+
+ /*
+ * Grab an active reference to the rtgroup structure. If we can't get
+ * it, we're racing with something that's tearing down the group, so
+ * signal that the group no longer exists. Take the rtbitmap in shared
+ * mode so that the group can't change while we're doing things.
+ */
+ error = xchk_rtgroup_init_existing(sc, rgno, &sc->sr);
+ if (!xchk_xref_process_error(sc, 0, 0, &error))
+ return error;
+
+ xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP_SHARED);
+
+ /*
+ * Since we already validated the rt superblock at mount time, we don't
+ * need to check its contents again. All we need is to cross-reference.
+ */
+ xchk_rgsuperblock_xref(sc);
+ return 0;
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int
+xrep_rgsuperblock(
+ struct xfs_scrub *sc)
+{
+ ASSERT(rtg_rgno(sc->sr.rtg) == 0);
+
+ xfs_log_sb(sc->tp);
+ return 0;
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index ba5bbc3fb754..39e9ad7cd8ae 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -358,7 +358,7 @@ xchk_rmapbt_rec(
struct xfs_rmap_irec irec;
if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
- xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
+ xfs_rmap_check_irec(to_perag(bs->cur->bc_group), &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -410,7 +410,7 @@ xchk_rmapbt_walk_ag_metadata(
goto out;
/* OWN_LOG: Internal log */
- if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) {
+ if (xfs_ag_contains_log(mp, pag_agno(sc->sa.pag))) {
error = xagb_bitmap_set(&cr->log_owned,
XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart),
mp->m_sb.sb_logblocks);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index e8080eba37d2..a0a227d183d2 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -231,7 +231,7 @@ xrep_rmap_stash(
if (xchk_iscan_aborted(&rr->iscan))
return -EFSCORRUPTED;
- trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+ trace_xrep_rmap_found(sc->sa.pag, &rmap);
mutex_lock(&rr->lock);
mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
@@ -344,7 +344,7 @@ xrep_rmap_visit_bmbt(
int error;
if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
- rf->rr->sc->sa.pag->pag_agno)
+ pag_agno(rf->rr->sc->sa.pag))
return 0;
agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
@@ -391,7 +391,7 @@ xrep_rmap_visit_iroot_btree_block(
return 0;
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+ if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
return 0;
agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -622,7 +622,7 @@ xrep_rmap_walk_inobt(
return error;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+ if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
return -EFSCORRUPTED;
agino = irec.ir_startino;
@@ -801,7 +801,7 @@ xrep_rmap_find_log_rmaps(
{
struct xfs_scrub *sc = rr->sc;
- if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
+ if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
return 0;
return xrep_rmap_stash(rr,
@@ -976,7 +976,7 @@ xrep_rmap_try_reserve(
{
struct xrep_rmap_agfl ra = {
.bitmap = freesp_blocks,
- .agno = rr->sc->sa.pag->pag_agno,
+ .agno = pag_agno(rr->sc->sa.pag),
};
struct xfs_scrub *sc = rr->sc;
struct xrep_newbt_resv *resv, *n;
@@ -1272,7 +1272,6 @@ xrep_rmap_build_new_tree(
struct xfs_perag *pag = sc->sa.pag;
struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_btree_cur *rmap_cur;
- xfs_fsblock_t fsbno;
int error;
/*
@@ -1290,9 +1289,9 @@ xrep_rmap_build_new_tree(
* rmapbt per-AG reservation, which we will adjust further after
* committing the new btree.
*/
- fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
- fsbno, XFS_AG_RESV_RMAPBT);
+ xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
+ XFS_AG_RESV_RMAPBT);
rr->new_btree.bload.get_records = xrep_rmap_get_records;
rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
@@ -1553,7 +1552,7 @@ xrep_rmapbt_live_update(
if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
goto out_unlock;
- trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+ trace_xrep_rmap_live_update(rr->sc->sa.pag, action, p);
error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
if (error)
@@ -1597,7 +1596,7 @@ xrep_rmap_setup_scan(
/* Set up in-memory rmap btree */
error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
- sc->sa.pag->pag_agno);
+ pag_agno(sc->sa.pag));
if (error)
goto out_mutex;
@@ -1612,7 +1611,7 @@ xrep_rmap_setup_scan(
*/
ASSERT(sc->flags & XCHK_FSGATES_RMAP);
xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
- error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
+ error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook);
if (error)
goto out_iscan;
return 0;
@@ -1633,7 +1632,7 @@ xrep_rmap_teardown(
struct xfs_scrub *sc = rr->sc;
xchk_iscan_abort(&rr->iscan);
- xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
+ xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook);
xchk_iscan_teardown(&rr->iscan);
xfbtree_destroy(&rr->rmap_btree);
mutex_destroy(&rr->lock);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 46583517377f..376a36fd9a9c 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -35,6 +35,10 @@ xchk_setup_rtbitmap(
return -ENOMEM;
sc->buf = rtb;
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
if (xchk_could_repair(sc)) {
error = xrep_setup_rtbitmap(sc, rtb);
if (error)
@@ -45,7 +49,8 @@ xchk_setup_rtbitmap(
if (error)
return error;
- error = xchk_install_live_inode(sc, sc->mp->m_rbmip);
+ error = xchk_install_live_inode(sc,
+ sc->sr.rtg->rtg_inodes[XFS_RTGI_BITMAP]);
if (error)
return error;
@@ -53,18 +58,18 @@ xchk_setup_rtbitmap(
if (error)
return error;
- xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
-
/*
* Now that we've locked the rtbitmap, we can't race with growfsrt
* trying to expand the bitmap or change the size of the rt volume.
* Hence it is safe to compute and check the geometry values.
*/
+ xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
- rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rtb->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
- rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+ rtb->rbmblocks = xfs_rtbitmap_blockcount(mp);
}
+
return 0;
}
@@ -73,7 +78,7 @@ xchk_setup_rtbitmap(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -82,10 +87,10 @@ xchk_rtbitmap_rec(
xfs_rtblock_t startblock;
xfs_filblks_t blockcount;
- startblock = xfs_rtx_to_rtb(mp, rec->ar_startext);
- blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ startblock = xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ blockcount = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
- if (!xfs_verify_rtbext(mp, startblock, blockcount))
+ if (!xfs_verify_rtbext(rtg_mount(rtg), startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -140,18 +145,20 @@ xchk_rtbitmap(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
struct xchk_rtbitmap *rtb = sc->buf;
int error;
/* Is sb_rextents correct? */
if (mp->m_sb.sb_rextents != rtb->rextents) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
/* Is sb_rextslog correct? */
if (mp->m_sb.sb_rextslog != rtb->rextslog) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -160,17 +167,17 @@ xchk_rtbitmap(
* case can we exceed 4bn bitmap blocks since the super field is a u32.
*/
if (rtb->rbmblocks > U32_MAX) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
/* The bitmap file length must be aligned to an fsblock. */
- if (mp->m_rbmip->i_disk_size & mp->m_blockmask) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ if (rbmip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -179,8 +186,8 @@ xchk_rtbitmap(
* growfsrt expands the bitmap file before updating sb_rextents, so the
* file can be larger than sb_rbmblocks.
*/
- if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ if (rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
return 0;
}
@@ -193,7 +200,7 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
return error;
@@ -207,6 +214,8 @@ xchk_xref_is_used_rt_space(
xfs_rtblock_t rtbno,
xfs_extlen_t len)
{
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -217,13 +226,10 @@ xchk_xref_is_used_rt_space(
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
- xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext,
+ error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
endext - startext + 1, &is_free);
if (!xchk_should_check_xref(sc, &error, NULL))
- goto out_unlock;
+ return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
-out_unlock:
- xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
}
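
Note: with the rtbitmap locked, xchk_rtbitmap() recomputes the expected
geometry from sb_rblocks and compares it against the superblock fields. A
userspace sketch of those derivations; the extent size, block size, and
bitmap packing are placeholder assumptions, not the on-disk geometry:

    #include <stdint.h>
    #include <stdio.h>

    #define RTEXTSIZE_BLOCKS  4ULL     /* hypothetical sb_rextsize */
    #define BLOCKSIZE_BYTES   4096ULL
    #define NBBY              8ULL

    static uint64_t calc_rextents(uint64_t rblocks)
    {
        return rblocks / RTEXTSIZE_BLOCKS;
    }

    static int calc_rextslog(uint64_t rextents)
    {
        int log = 0;

        while (rextents >>= 1)
            log++;
        return log;  /* position of the highest set bit */
    }

    static uint64_t calc_rbmblocks(uint64_t rextents)
    {
        uint64_t bits_per_block = BLOCKSIZE_BYTES * NBBY;

        return (rextents + bits_per_block - 1) / bits_per_block;
    }

    int main(void)
    {
        uint64_t rextents = calc_rextents(1ULL << 20);

        printf("rextents=%llu rextslog=%d rbmblocks=%llu\n",
               (unsigned long long)rextents, calc_rextslog(rextents),
               (unsigned long long)calc_rbmblocks(rextents));
        return 0;
    }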
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 7c7366c98338..49fc6250bafc 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -18,6 +18,7 @@
#include "xfs_bmap.h"
#include "xfs_sb.h"
#include "xfs_exchmaps.h"
+#include "xfs_rtgroup.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -46,12 +47,19 @@ xchk_setup_rtsummary(
struct xchk_rtsummary *rts;
int error;
+ if (xchk_need_intent_drain(sc))
+ xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize),
XCHK_GFP_FLAGS);
if (!rts)
return -ENOMEM;
sc->buf = rts;
+ error = xchk_rtgroup_init(sc, sc->sm->sm_agno, &sc->sr);
+ if (error)
+ return error;
+
if (xchk_could_repair(sc)) {
error = xrep_setup_rtsummary(sc, rts);
if (error)
@@ -73,7 +81,8 @@ xchk_setup_rtsummary(
if (error)
return error;
- error = xchk_install_live_inode(sc, mp->m_rsumip);
+ error = xchk_install_live_inode(sc,
+ sc->sr.rtg->rtg_inodes[XFS_RTGI_SUMMARY]);
if (error)
return error;
@@ -82,29 +91,23 @@ xchk_setup_rtsummary(
return error;
/*
- * Locking order requires us to take the rtbitmap first. We must be
- * careful to unlock it ourselves when we are done with the rtbitmap
- * file since the scrub infrastructure won't do that for us. Only
- * then we can lock the rtsummary inode.
- */
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
-
- /*
* Now that we've locked the rtbitmap and rtsummary, we can't race with
* growfsrt trying to expand the summary or change the size of the rt
* volume. Hence it is safe to compute and check the geometry values.
+ *
+	 * Note that there is no strict requirement for an exclusive lock on
+	 * the summary here, but to keep the locking APIs simple we lock both
+	 * inodes exclusively. If we ever start caring about running fsmap
+	 * concurrently with scrub, this could be changed.
*/
+ xchk_rtgroup_lock(&sc->sr, XFS_RTGLOCK_BITMAP);
if (mp->m_sb.sb_rblocks) {
- int rextslog;
-
- rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
- rextslog = xfs_compute_rextslog(rts->rextents);
- rts->rsumlevels = rextslog + 1;
- rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
- rts->rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
- rts->rbmblocks);
+ rts->rextents = xfs_blen_to_rtbxlen(mp, mp->m_sb.sb_rblocks);
+ rts->rbmblocks = xfs_rtbitmap_blockcount(mp);
+ rts->rsumblocks =
+ xfs_rtsummary_blockcount(mp, &rts->rsumlevels);
}
+
return 0;
}
@@ -148,6 +151,11 @@ xchk_rtsum_inc(
struct xfs_mount *mp,
union xfs_suminfo_raw *v)
{
+ if (xfs_has_rtgroups(mp)) {
+ be32_add_cpu(&v->rtg, 1);
+ return be32_to_cpu(v->rtg);
+ }
+
v->old += 1;
return v->old;
}
@@ -155,11 +163,12 @@ xchk_rtsum_inc(
/* Update the summary file to reflect the free extent that we've accumulated. */
STATIC int
xchk_rtsum_record_free(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_scrub *sc = priv;
xfs_fileoff_t rbmoff;
xfs_rtblock_t rtbno;
@@ -178,11 +187,12 @@ xchk_rtsum_record_free(
lenlog = xfs_highbit64(rec->ar_extcount);
offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
- rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ rtbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ rtlen = xfs_rtxlen_to_extlen(mp, rec->ar_extcount);
if (!xfs_verify_rtbext(mp, rtbno, rtlen)) {
- xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc,
+ rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_ino);
return -EFSCORRUPTED;
}
@@ -204,15 +214,14 @@ xchk_rtsum_compute(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
- unsigned long long rtbmp_blocks;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
/* If the bitmap size doesn't match the computed size, bail. */
- rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents);
- if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size)
+ if (XFS_FSB_TO_B(mp, xfs_rtbitmap_blockcount(mp)) !=
+ rtg->rtg_inodes[XFS_RTGI_BITMAP]->i_disk_size)
return -EFSCORRUPTED;
- return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
- sc);
+ return xfs_rtalloc_query_all(rtg, sc->tp, xchk_rtsum_record_free, sc);
}
/* Compare the rtsummary file against the one we computed. */
@@ -231,8 +240,9 @@ xchk_rtsum_compare(
xfs_rtsumoff_t sumoff = 0;
int error = 0;
- rts->args.mp = sc->mp;
+ rts->args.mp = mp;
rts->args.tp = sc->tp;
+ rts->args.rtg = sc->sr.rtg;
/* Mappings may not cross or lie beyond EOF. */
endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
@@ -299,31 +309,34 @@ xchk_rtsummary(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xfs_rtgroup *rtg = sc->sr.rtg;
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
struct xchk_rtsummary *rts = sc->buf;
- int error = 0;
+ int error;
/* Is sb_rextents correct? */
if (mp->m_sb.sb_rextents != rts->rextents) {
- xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
+ return 0;
}
/* Is m_rsumlevels correct? */
if (mp->m_rsumlevels != rts->rsumlevels) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* Is m_rsumsize correct? */
if (mp->m_rsumblocks != rts->rsumblocks) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* The summary file length must be aligned to an fsblock. */
- if (mp->m_rsumip->i_disk_size & mp->m_blockmask) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ if (rsumip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/*
@@ -331,15 +344,15 @@ xchk_rtsummary(
* growfsrt expands the summary file before updating sb_rextents, so
* the file can be larger than rsumsize.
*/
- if (mp->m_rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) {
- xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
- goto out_rbm;
+ if (rsumip->i_disk_size < XFS_FSB_TO_B(mp, rts->rsumblocks)) {
+ xchk_ino_set_corrupt(sc, rsumip->i_ino);
+ return 0;
}
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out_rbm;
+ return error;
/* Construct the new summary file from the rtbitmap. */
error = xchk_rtsum_compute(sc);
@@ -348,23 +361,12 @@ xchk_rtsummary(
* EFSCORRUPTED means the rtbitmap is corrupt, which is an xref
* error since we're checking the summary file.
*/
- xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
- error = 0;
- goto out_rbm;
+ xchk_ino_set_corrupt(sc, rbmip->i_ino);
+ return 0;
}
if (error)
- goto out_rbm;
+ return error;
/* Does the computed summary file match the actual rtsummary file? */
- error = xchk_rtsum_compare(sc);
-
-out_rbm:
- /*
- * Unlock the rtbitmap since we're done with it. All other writers of
- * the rt free space metadata grab the bitmap and summary ILOCKs in
- * that order, so we're still protected against allocation activities
- * even if we continue on to the repair function.
- */
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- return error;
+ return xchk_rtsum_compare(sc);
}
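
A note on the return-value convention visible in this rewrite: scrub reports corruption through status flags rather than error codes, which is why each geometry mismatch above marks the inode corrupt and then returns 0. A minimal sketch of that convention, using hypothetical names rather than the kernel's types:

#include <stdbool.h>
#include <stdint.h>

struct scrub_result {
	bool corrupt;		/* surfaced to userspace as a CORRUPT flag */
};

/*
 * Illustrative only: a mismatch is a scrub finding, so set the flag and
 * return 0; nonzero returns are reserved for operational failures such
 * as I/O errors or memory allocation failures.
 */
static int check_rtsummary_geometry(struct scrub_result *sr,
		uint64_t ondisk_rextents, uint64_t computed_rextents,
		uint64_t file_size, uint64_t min_size)
{
	if (ondisk_rextents != computed_rextents) {
		sr->corrupt = true;
		return 0;
	}
	if (file_size < min_size) {
		sr->corrupt = true;
		return 0;
	}
	return 0;
}
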
diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c
index 7deeb948cb70..8198ea84ad70 100644
--- a/fs/xfs/scrub/rtsummary_repair.c
+++ b/fs/xfs/scrub/rtsummary_repair.c
@@ -76,18 +76,30 @@ xrep_rtsummary_prep_buf(
union xfs_suminfo_raw *ondisk;
int error;
- rts->args.mp = sc->mp;
+ rts->args.mp = mp;
rts->args.tp = sc->tp;
+ rts->args.rtg = sc->sr.rtg;
rts->args.sumbp = bp;
ondisk = xfs_rsumblock_infoptr(&rts->args, 0);
rts->args.sumbp = NULL;
- bp->b_ops = &xfs_rtbuf_ops;
-
error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize);
if (error)
return error;
+ if (xfs_has_rtgroups(sc->mp)) {
+ struct xfs_rtbuf_blkinfo *hdr = bp->b_addr;
+
+ hdr->rt_magic = cpu_to_be32(XFS_RTSUMMARY_MAGIC);
+ hdr->rt_owner = cpu_to_be64(sc->ip->i_ino);
+ hdr->rt_blkno = cpu_to_be64(xfs_buf_daddr(bp));
+ hdr->rt_lsn = 0;
+ uuid_copy(&hdr->rt_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ bp->b_ops = &xfs_rtsummary_buf_ops;
+ } else {
+ bp->b_ops = &xfs_rtbuf_ops;
+ }
+
rts->prep_wordoff += mp->m_blockwsize;
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF);
return 0;
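
On rtgroups filesystems, each rebuilt rtsummary block gains a self-describing header before commit: magic, owning inode, disk address, LSN, and the metadata UUID, matching what the write verifier will later demand. A compilable userspace sketch of that stamping step, assuming stand-in types and GCC/Clang byte-swap builtins on a little-endian host (none of these names are the kernel's on-disk definitions):

#include <stdint.h>
#include <string.h>

/* stand-in for a self-describing rt block header */
struct rtblk_hdr {
	uint32_t magic;		/* big-endian magic number */
	uint64_t owner;		/* inode that owns this block */
	uint64_t blkno;		/* daddr, so misplaced writes are caught */
	uint64_t lsn;		/* 0 for a freshly rebuilt block */
	uint8_t  uuid[16];	/* filesystem metadata UUID */
};

static uint32_t cpu_to_be32(uint32_t x) { return __builtin_bswap32(x); }
static uint64_t cpu_to_be64(uint64_t x) { return __builtin_bswap64(x); }

static void stamp_rtblk_hdr(struct rtblk_hdr *hdr, uint32_t magic,
		uint64_t owner_ino, uint64_t daddr, const uint8_t uuid[16])
{
	hdr->magic = cpu_to_be32(magic);
	hdr->owner = cpu_to_be64(owner_ino);
	hdr->blkno = cpu_to_be64(daddr);
	hdr->lsn = 0;
	memcpy(hdr->uuid, uuid, 16);
}
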
@@ -162,8 +174,8 @@ xrep_rtsummary(
return error;
/* Reset incore state and blow out the summary cache. */
- if (mp->m_rsum_cache)
- memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
+ if (sc->sr.rtg->rtg_rsum_cache)
+ memset(sc->sr.rtg->rtg_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks);
mp->m_rsumlevels = rts->rsumlevels;
mp->m_rsumblocks = rts->rsumblocks;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4cbcf7a86dbe..950f5a58dcd9 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -225,6 +225,8 @@ xchk_teardown(
xfs_trans_cancel(sc->tp);
sc->tp = NULL;
}
+ if (sc->sr.rtg)
+ xchk_rtgroup_free(sc, &sc->sr);
if (sc->ip) {
if (sc->ilock_flags)
xchk_iunlock(sc, sc->ilock_flags);
@@ -382,13 +384,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.repair = xrep_parent,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
- .type = ST_FS,
+ .type = ST_RTGROUP,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
- .type = ST_FS,
+ .type = ST_RTGROUP,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,
@@ -442,6 +444,20 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.has = xfs_has_parent,
.repair = xrep_dirtree,
},
+ [XFS_SCRUB_TYPE_METAPATH] = { /* metadata directory tree path */
+ .type = ST_GENERIC,
+ .setup = xchk_setup_metapath,
+ .scrub = xchk_metapath,
+ .has = xfs_has_metadir,
+ .repair = xrep_metapath,
+ },
+ [XFS_SCRUB_TYPE_RGSUPER] = { /* realtime group superblock */
+ .type = ST_RTGROUP,
+ .setup = xchk_setup_rgsuperblock,
+ .scrub = xchk_rgsuperblock,
+ .has = xfs_has_rtsb,
+ .repair = xrep_rgsuperblock,
+ },
};
static int
@@ -489,6 +505,35 @@ xchk_validate_inputs(
if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
goto out;
break;
+ case ST_GENERIC:
+ break;
+ case ST_RTGROUP:
+ if (sm->sm_ino || sm->sm_gen)
+ goto out;
+ if (xfs_has_rtgroups(mp)) {
+ /*
+ * On a rtgroups filesystem, there won't be an rtbitmap
+ * or rtsummary file for group 0 unless there's
+ * actually a realtime volume attached. However, older
+ * xfs_scrub always calls the rtbitmap/rtsummary
+ * scrubbers with sm_agno==0, so transform the error
+ * code to ENOENT.
+ */
+ if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
+ if (sm->sm_agno == 0)
+ error = -ENOENT;
+ goto out;
+ }
+ } else {
+ /*
+ * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
+ * accepted sm_agno==0, so we still accept that for
+ * scrubbing pre-rtgroups filesystems.
+ */
+ if (sm->sm_agno != 0)
+ goto out;
+ }
+ break;
default:
goto out;
}
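
The ST_RTGROUP input checks reduce to a small predicate: rtgroups filesystems require sm_agno < sb_rgcount (with group 0 mapped to ENOENT so older xfs_scrub sees "no rt volume" rather than "bad input"), while pre-rtgroups filesystems accept only sm_agno == 0. A self-contained sketch of that decision, assuming the function's generic rejection path yields EINVAL (hypothetical helper, not kernel code):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static int validate_rt_scrub_agno(bool has_rtgroups, uint32_t rgcount,
		uint32_t agno)
{
	if (has_rtgroups) {
		if (agno >= rgcount) {
			/* old xfs_scrub always passes 0: say "no rt volume" */
			return agno == 0 ? -ENOENT : -EINVAL;
		}
		return 0;
	}
	/* pre-rtgroups: the rt scrubbers only ever accepted group 0 */
	return agno == 0 ? 0 : -EINVAL;
}
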
@@ -605,8 +650,7 @@ xfs_scrub_metadata(
if (error)
goto out;
- xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
- "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);
sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 5993fcaffb2c..a7fda3e2b013 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -73,6 +73,8 @@ enum xchk_type {
ST_PERAG, /* per-AG metadata */
ST_FS, /* per-FS metadata */
ST_INODE, /* per-inode metadata */
+ ST_GENERIC, /* determined by the scrubber */
+ ST_RTGROUP, /* rtgroup metadata */
};
struct xchk_meta_ops {
@@ -117,6 +119,15 @@ struct xchk_ag {
struct xfs_btree_cur *refc_cur;
};
+/* Inode lock state for the RT volume. */
+struct xchk_rt {
+ /* incore rtgroup, if applicable */
+ struct xfs_rtgroup *rtg;
+
+ /* XFS_RTGLOCK_* lock state if locked */
+ unsigned int rtlock_flags;
+};
+
struct xfs_scrub {
/* General scrub state. */
struct xfs_mount *mp;
@@ -178,6 +189,9 @@ struct xfs_scrub {
/* State tracking for single-AG operations. */
struct xchk_ag sa;
+
+ /* State tracking for realtime operations. */
+ struct xchk_rt sr;
};
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
@@ -255,12 +269,15 @@ int xchk_xattr(struct xfs_scrub *sc);
int xchk_symlink(struct xfs_scrub *sc);
int xchk_parent(struct xfs_scrub *sc);
int xchk_dirtree(struct xfs_scrub *sc);
+int xchk_metapath(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
int xchk_rtbitmap(struct xfs_scrub *sc);
int xchk_rtsummary(struct xfs_scrub *sc);
+int xchk_rgsuperblock(struct xfs_scrub *sc);
#else
# define xchk_rtbitmap xchk_nothing
# define xchk_rtsummary xchk_nothing
+# define xchk_rgsuperblock xchk_nothing
#endif
#ifdef CONFIG_XFS_QUOTA
int xchk_quota(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index 7996c2335476..a476c7b2ab75 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -80,6 +80,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
[XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck",
[XFS_SCRUB_TYPE_NLINKS] = "nlinks",
[XFS_SCRUB_TYPE_DIRTREE] = "dirtree",
+ [XFS_SCRUB_TYPE_METAPATH] = "metapath",
+ [XFS_SCRUB_TYPE_RGSUPER] = "rgsuper",
};
/* Format the scrub stats into a text buffer, similar to pcp style. */
diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c
index 177f922acfaf..4b7f7860e37e 100644
--- a/fs/xfs/scrub/tempfile.c
+++ b/fs/xfs/scrub/tempfile.c
@@ -22,6 +22,7 @@
#include "xfs_exchmaps.h"
#include "xfs_defer.h"
#include "xfs_symlink_remote.h"
+#include "xfs_metafile.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -182,6 +183,101 @@ out_release_dquots:
return error;
}
+/*
+ * Temporary files have to be created before we even know which inode we're
+ * going to scrub, so we assume that they will be part of the regular directory
+ * tree. If it turns out that we're actually scrubbing a file from the
+ * metadata directory tree, we have to subtract the temp file from the root
+ * dquots and detach the dquots.
+ */
+int
+xrep_tempfile_adjust_directory_tree(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!sc->tempip)
+ return 0;
+
+ ASSERT(sc->tp == NULL);
+ ASSERT(!xfs_is_metadir_inode(sc->tempip));
+
+ if (!sc->ip || !xfs_is_metadir_inode(sc->ip))
+ return 0;
+
+ xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_iolock;
+
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ /* Metadir files are not accounted in quota, so drop icount */
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, -1L);
+ xfs_metafile_set_iflag(sc->tp, sc->tempip, XFS_METAFILE_UNKNOWN);
+
+ error = xrep_trans_commit(sc);
+ if (error)
+ goto out_ilock;
+
+ xfs_qm_dqdetach(sc->tempip);
+out_ilock:
+ xrep_tempfile_iunlock(sc);
+out_iolock:
+ xrep_tempfile_iounlock(sc);
+ return error;
+}
+
+/*
+ * Remove this temporary file from the metadata directory tree so that it can
+ * be inactivated the normal way.
+ */
+STATIC int
+xrep_tempfile_remove_metadir(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!sc->tempip || !xfs_is_metadir_inode(sc->tempip))
+ return 0;
+
+ ASSERT(sc->tp == NULL);
+
+ xfs_ilock(sc->tempip, XFS_IOLOCK_EXCL);
+ sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_iolock;
+
+ xrep_tempfile_ilock(sc);
+ xfs_trans_ijoin(sc->tp, sc->tempip, 0);
+
+ xfs_metafile_clear_iflag(sc->tp, sc->tempip);
+
+ /* Non-metadir files are accounted in quota, so bump bcount/icount */
+ error = xfs_qm_dqattach_locked(sc->tempip, false);
+ if (error)
+ goto out_cancel;
+
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_ICOUNT, 1L);
+ xfs_trans_mod_dquot_byino(sc->tp, sc->tempip, XFS_TRANS_DQ_BCOUNT,
+ sc->tempip->i_nblocks);
+ error = xrep_trans_commit(sc);
+ goto out_ilock;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_ilock:
+ xrep_tempfile_iunlock(sc);
+out_iolock:
+ xrep_tempfile_iounlock(sc);
+ return error;
+}
+
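
The two helpers above are inverses with asymmetric quota deltas: moving the temp file into the metadata tree only subtracts an inode count (the file is freshly created and holds no blocks yet), while moving it back out restores both the inode count and however many blocks the repair staged into it. A toy sketch of that pairing (illustrative names only):

#include <stdint.h>

struct quota_counters {
	int64_t icount;	/* inodes charged to the owning dquots */
	int64_t bcount;	/* blocks charged to the owning dquots */
};

/* entering the metadir tree: metadata files are not quota-accounted */
static void tempfile_enter_metadir(struct quota_counters *q)
{
	q->icount -= 1;		/* no blocks yet, so no bcount delta */
}

/* leaving the metadir tree: restore accounting before normal reclaim */
static void tempfile_leave_metadir(struct quota_counters *q,
		int64_t nblocks)
{
	q->icount += 1;
	q->bcount += nblocks;	/* blocks staged during the repair */
}
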
/* Take IOLOCK_EXCL on the temporary file, maybe. */
bool
xrep_tempfile_iolock_nowait(
@@ -290,6 +386,7 @@ xrep_tempfile_rele(
sc->temp_ilock_flags = 0;
}
+ xrep_tempfile_remove_metadir(sc);
xchk_irele(sc, sc->tempip);
sc->tempip = NULL;
}
@@ -844,6 +941,14 @@ xrep_is_tempfile(
const struct xfs_inode *ip)
{
const struct inode *inode = &ip->i_vnode;
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * Files in the metadata directory tree also have S_PRIVATE set and
+ * IOP_XATTR unset, so we must rule them out explicitly.
+ */
+ if (xfs_has_metadir(mp) && (ip->i_diflags2 & XFS_DIFLAG2_METADATA))
+ return false;
if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR))
return true;
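
This ordering matters because metadir inodes and repair tempfiles now share the same VFS signature (S_PRIVATE set, no xattr ops), so the metadir test must come first. A compilable restatement with stand-in booleans for the inode state (illustrative, not the kernel predicate):

#include <stdbool.h>

static bool is_repair_tempfile(bool has_metadir, bool is_metadata_inode,
		bool s_private, bool has_xattr_ops)
{
	/* metadir files match the tempfile signature; exclude them first */
	if (has_metadir && is_metadata_inode)
		return false;
	return s_private && !has_xattr_ops;
}
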
diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h
index e51399f595fe..71c1b54599c3 100644
--- a/fs/xfs/scrub/tempfile.h
+++ b/fs/xfs/scrub/tempfile.h
@@ -10,6 +10,8 @@
int xrep_tempfile_create(struct xfs_scrub *sc, uint16_t mode);
void xrep_tempfile_rele(struct xfs_scrub *sc);
+int xrep_tempfile_adjust_directory_tree(struct xfs_scrub *sc);
+
bool xrep_tempfile_iolock_nowait(struct xfs_scrub *sc);
int xrep_tempfile_iolock_polled(struct xfs_scrub *sc);
void xrep_tempfile_iounlock(struct xfs_scrub *sc);
@@ -42,6 +44,7 @@ static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc)
xchk_ilock(sc, XFS_IOLOCK_EXCL);
}
# define xrep_is_tempfile(ip) (false)
+# define xrep_tempfile_adjust_directory_tree(sc) (0)
# define xrep_tempfile_rele(sc)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 4470ad0533b8..98f923ae664d 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -20,6 +20,7 @@
#include "xfs_dir2.h"
#include "xfs_rmap.h"
#include "xfs_parent.h"
+#include "xfs_metafile.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index c886d5d0eb02..9b38f5ad1eaf 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -70,6 +70,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIRTREE);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_METAPATH);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RGSUPER);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -101,7 +103,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BARRIER);
{ XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \
{ XFS_SCRUB_TYPE_HEALTHY, "healthy" }, \
{ XFS_SCRUB_TYPE_DIRTREE, "dirtree" }, \
- { XFS_SCRUB_TYPE_BARRIER, "barrier" }
+ { XFS_SCRUB_TYPE_BARRIER, "barrier" }, \
+ { XFS_SCRUB_TYPE_METAPATH, "metapath" }, \
+ { XFS_SCRUB_TYPE_RGSUPER, "rgsuper" }
#define XFS_SCRUB_FLAG_STRINGS \
{ XFS_SCRUB_IFLAG_REPAIR, "repair" }, \
@@ -772,12 +776,12 @@ TRACE_EVENT(xchk_xref_error,
);
TRACE_EVENT(xchk_iallocbt_check_cluster,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino, xfs_daddr_t map_daddr,
- unsigned short map_len, unsigned int chunk_ino,
- unsigned int nr_inodes, uint16_t cluster_mask,
- uint16_t holemask, unsigned int cluster_ino),
- TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agino_t startino,
+ xfs_daddr_t map_daddr, unsigned short map_len,
+ unsigned int chunk_ino, unsigned int nr_inodes,
+ uint16_t cluster_mask, uint16_t holemask,
+ unsigned int cluster_ino),
+ TP_ARGS(pag, startino, map_daddr, map_len, chunk_ino, nr_inodes,
cluster_mask, holemask, cluster_ino),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -792,8 +796,8 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__field(uint16_t, holemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startino = startino;
__entry->map_daddr = map_daddr;
__entry->map_len = map_len;
@@ -922,7 +926,8 @@ DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
TRACE_EVENT(xchk_refcount_incorrect,
- TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
+ TP_PROTO(const struct xfs_perag *pag,
+ const struct xfs_refcount_irec *irec,
xfs_nlink_t seen),
TP_ARGS(pag, irec, seen),
TP_STRUCT__entry(
@@ -935,8 +940,8 @@ TRACE_EVENT(xchk_refcount_incorrect,
__field(xfs_nlink_t, seen)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -1752,6 +1757,7 @@ DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_badgen);
DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_nondir_parent);
DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_unlinked_parent);
DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_found_next_step);
+DEFINE_XCHK_DIRPATH_EVENT(xchk_dirpath_crosses_tree);
TRACE_DEFINE_ENUM(XCHK_DIRPATH_SCANNING);
TRACE_DEFINE_ENUM(XCHK_DIRPATH_DELETE);
@@ -1914,11 +1920,44 @@ TRACE_EVENT(xchk_dirtree_live_update,
__get_str(name))
);
+DECLARE_EVENT_CLASS(xchk_metapath_class,
+ TP_PROTO(struct xfs_scrub *sc, const char *path,
+ struct xfs_inode *dp, xfs_ino_t ino),
+ TP_ARGS(sc, path, dp, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, scrub_ino)
+ __field(xfs_ino_t, parent_ino)
+ __field(xfs_ino_t, ino)
+ __string(name, path)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->scrub_ino = sc->ip ? sc->ip->i_ino : NULLFSINO;
+ __entry->parent_ino = dp ? dp->i_ino : NULLFSINO;
+ __entry->ino = ino;
+ __assign_str(name);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx name '%s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->scrub_ino,
+ __entry->parent_ino,
+ __get_str(name),
+ __entry->ino)
+);
+#define DEFINE_XCHK_METAPATH_EVENT(name) \
+DEFINE_EVENT(xchk_metapath_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, const char *path, \
+ struct xfs_inode *dp, xfs_ino_t ino), \
+ TP_ARGS(sc, path, dp, ino))
+DEFINE_XCHK_METAPATH_EVENT(xchk_metapath_lookup);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t len),
TP_ARGS(pag, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1927,8 +1966,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->len = len;
),
@@ -1940,7 +1979,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ xfs_extlen_t len), \
TP_ARGS(pag, agbno, len))
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
@@ -1949,8 +1989,8 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
DECLARE_EVENT_CLASS(xrep_reap_find_class,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
- bool crosslinked),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t len, bool crosslinked),
TP_ARGS(pag, agbno, len, crosslinked),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1960,8 +2000,8 @@ DECLARE_EVENT_CLASS(xrep_reap_find_class,
__field(bool, crosslinked)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->len = len;
__entry->crosslinked = crosslinked;
@@ -1975,17 +2015,15 @@ DECLARE_EVENT_CLASS(xrep_reap_find_class,
);
#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
DEFINE_EVENT(xrep_reap_find_class, name, \
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \
- bool crosslinked), \
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ xfs_extlen_t len, bool crosslinked), \
TP_ARGS(pag, agbno, len, crosslinked))
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
DEFINE_REPAIR_REAP_FIND_EVENT(xreap_bmapi_select);
-DECLARE_EVENT_CLASS(xrep_rmap_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len,
- uint64_t owner, uint64_t offset, unsigned int flags),
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+TRACE_EVENT(xrep_ibt_walk_rmap,
+ TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1996,13 +2034,13 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->len = len;
- __entry->owner = owner;
- __entry->offset = offset;
- __entry->flags = flags;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agbno = rec->rm_startblock;
+ __entry->len = rec->rm_blockcount;
+ __entry->owner = rec->rm_owner;
+ __entry->offset = rec->rm_offset;
+ __entry->flags = rec->rm_flags;
),
TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2013,19 +2051,11 @@ DECLARE_EVENT_CLASS(xrep_rmap_class,
__entry->offset,
__entry->flags)
);
-#define DEFINE_REPAIR_RMAP_EVENT(name) \
-DEFINE_EVENT(xrep_rmap_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len, \
- uint64_t owner, uint64_t offset, unsigned int flags), \
- TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
-DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
TRACE_EVENT(xrep_abt_found,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(const struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *rec),
- TP_ARGS(mp, agno, rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2033,8 +2063,8 @@ TRACE_EVENT(xrep_abt_found,
__field(xfs_extlen_t, blockcount)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startblock = rec->ar_startblock;
__entry->blockcount = rec->ar_blockcount;
),
@@ -2046,9 +2076,9 @@ TRACE_EVENT(xrep_abt_found,
)
TRACE_EVENT(xrep_ibt_found,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(const struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *rec),
- TP_ARGS(mp, agno, rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2059,8 +2089,8 @@ TRACE_EVENT(xrep_ibt_found,
__field(uint64_t, freemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startino = rec->ir_startino;
__entry->holemask = rec->ir_holemask;
__entry->count = rec->ir_count;
@@ -2078,7 +2108,8 @@ TRACE_EVENT(xrep_ibt_found,
)
TRACE_EVENT(xrep_refc_found,
- TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
+ TP_PROTO(const struct xfs_perag *pag,
+ const struct xfs_refcount_irec *rec),
TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -2089,8 +2120,8 @@ TRACE_EVENT(xrep_refc_found,
__field(xfs_nlink_t, refcount)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->domain = rec->rc_domain;
__entry->startblock = rec->rc_startblock;
__entry->blockcount = rec->rc_blockcount;
@@ -2138,9 +2169,8 @@ TRACE_EVENT(xrep_bmap_found,
);
TRACE_EVENT(xrep_rmap_found,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- const struct xfs_rmap_irec *rec),
- TP_ARGS(mp, agno, rec),
+ TP_PROTO(const struct xfs_perag *pag, const struct xfs_rmap_irec *rec),
+ TP_ARGS(pag, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2151,8 +2181,8 @@ TRACE_EVENT(xrep_rmap_found,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = rec->rm_startblock;
__entry->len = rec->rm_blockcount;
__entry->owner = rec->rm_owner;
@@ -2170,9 +2200,9 @@ TRACE_EVENT(xrep_rmap_found,
);
TRACE_EVENT(xrep_findroot_block,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
- TP_ARGS(mp, agno, agbno, magic, level),
+ TP_ARGS(pag, agbno, magic, level),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2181,8 +2211,8 @@ TRACE_EVENT(xrep_findroot_block,
__field(uint16_t, level)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->magic = magic;
__entry->level = level;
@@ -2195,10 +2225,10 @@ TRACE_EVENT(xrep_findroot_block,
__entry->level)
)
TRACE_EVENT(xrep_calc_ag_resblks,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agino_t icount,
+ xfs_agblock_t aglen, xfs_agblock_t freelen,
xfs_agblock_t usedlen),
- TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+ TP_ARGS(pag, icount, aglen, freelen, usedlen),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2208,8 +2238,8 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__field(xfs_agblock_t, usedlen)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->icount = icount;
__entry->aglen = aglen;
__entry->freelen = freelen;
@@ -2224,10 +2254,10 @@ TRACE_EVENT(xrep_calc_ag_resblks,
__entry->usedlen)
)
TRACE_EVENT(xrep_calc_ag_resblks_btsize,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
- xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
- TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t bnobt_sz,
+ xfs_agblock_t inobt_sz, xfs_agblock_t rmapbt_sz,
+ xfs_agblock_t refcbt_sz),
+ TP_ARGS(pag, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2237,8 +2267,8 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
__field(xfs_agblock_t, refcbt_sz)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bnobt_sz = bnobt_sz;
__entry->inobt_sz = inobt_sz;
__entry->rmapbt_sz = rmapbt_sz;
@@ -2278,10 +2308,9 @@ TRACE_EVENT(xrep_reset_counters,
)
DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len,
- int64_t owner),
- TP_ARGS(mp, agno, agbno, len, owner),
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t len, int64_t owner),
+ TP_ARGS(pag, agbno, len, owner),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2290,8 +2319,8 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
__field(int64_t, owner)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = owner;
@@ -2305,10 +2334,9 @@ DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
);
#define DEFINE_NEWBT_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_newbt_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len, \
- int64_t owner), \
- TP_ARGS(mp, agno, agbno, len, owner))
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno, \
+ xfs_extlen_t len, int64_t owner), \
+ TP_ARGS(pag, agbno, len, owner))
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
@@ -2596,7 +2624,7 @@ TRACE_EVENT(xrep_cow_replace_mapping,
);
TRACE_EVENT(xrep_cow_free_staging,
- TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
xfs_extlen_t blockcount),
TP_ARGS(pag, agbno, blockcount),
TP_STRUCT__entry(
@@ -2606,8 +2634,8 @@ TRACE_EVENT(xrep_cow_free_staging,
__field(xfs_extlen_t, blockcount)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->blockcount = blockcount;
),
@@ -2652,9 +2680,9 @@ DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
TRACE_EVENT(xrep_rmap_live_update,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int op,
const struct xfs_rmap_update_params *p),
- TP_ARGS(mp, agno, op, p),
+ TP_ARGS(pag, op, p),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -2666,8 +2694,8 @@ TRACE_EVENT(xrep_rmap_live_update,
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->op = op;
__entry->agbno = p->startblock;
__entry->len = p->blockcount;
@@ -3313,7 +3341,7 @@ DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_rebuild);
DEFINE_XREP_SYMLINK_EVENT(xrep_symlink_reset_fork);
TRACE_EVENT(xrep_iunlink_visit,
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t bucket_agino, struct xfs_inode *ip),
TP_ARGS(pag, bucket, bucket_agino, ip),
TP_STRUCT__entry(
@@ -3326,9 +3354,9 @@ TRACE_EVENT(xrep_iunlink_visit,
__field(xfs_agino_t, next_agino)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
- __entry->agino = XFS_INO_TO_AGINO(pag->pag_mount, ip->i_ino);
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agino = XFS_INO_TO_AGINO(pag_mount(pag), ip->i_ino);
__entry->bucket = bucket;
__entry->bucket_agino = bucket_agino;
__entry->prev_agino = ip->i_prev_unlinked;
@@ -3403,7 +3431,7 @@ TRACE_EVENT(xrep_iunlink_reload_ondisk,
);
TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t prev_agino, xfs_agino_t next_agino),
TP_ARGS(pag, bucket, prev_agino, next_agino),
TP_STRUCT__entry(
@@ -3414,8 +3442,8 @@ TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
__field(xfs_agino_t, next_agino)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bucket = bucket;
__entry->prev_agino = prev_agino;
__entry->next_agino = next_agino;
@@ -3429,7 +3457,7 @@ TRACE_EVENT(xrep_iunlink_walk_ondisk_bucket,
);
DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t prev_agino, xfs_agino_t next_agino),
TP_ARGS(pag, bucket, prev_agino, next_agino),
TP_STRUCT__entry(
@@ -3440,8 +3468,8 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
__field(xfs_agino_t, next_agino)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bucket = bucket;
__entry->prev_agino = prev_agino;
__entry->next_agino = next_agino;
@@ -3455,7 +3483,7 @@ DECLARE_EVENT_CLASS(xrep_iunlink_resolve_class,
);
#define DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(name) \
DEFINE_EVENT(xrep_iunlink_resolve_class, name, \
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket, \
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket, \
xfs_agino_t prev_agino, xfs_agino_t next_agino), \
TP_ARGS(pag, bucket, prev_agino, next_agino))
DEFINE_REPAIR_IUNLINK_RESOLVE_EVENT(xrep_iunlink_resolve_uncached);
@@ -3516,7 +3544,7 @@ TRACE_EVENT(xrep_iunlink_relink_prev,
);
TRACE_EVENT(xrep_iunlink_add_to_bucket,
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t agino, xfs_agino_t curr_head),
TP_ARGS(pag, bucket, agino, curr_head),
TP_STRUCT__entry(
@@ -3527,8 +3555,8 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket,
__field(xfs_agino_t, next_agino)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bucket = bucket;
__entry->agino = agino;
__entry->next_agino = curr_head;
@@ -3542,7 +3570,7 @@ TRACE_EVENT(xrep_iunlink_add_to_bucket,
);
TRACE_EVENT(xrep_iunlink_commit_bucket,
- TP_PROTO(struct xfs_perag *pag, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t old_agino, xfs_agino_t agino),
TP_ARGS(pag, bucket, old_agino, agino),
TP_STRUCT__entry(
@@ -3553,8 +3581,8 @@ TRACE_EVENT(xrep_iunlink_commit_bucket,
__field(xfs_agino_t, agino)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bucket = bucket;
__entry->old_agino = old_agino;
__entry->agino = agino;
@@ -3572,6 +3600,11 @@ DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_delete_path);
DEFINE_XCHK_DIRTREE_EVENT(xrep_dirtree_create_adoption);
DEFINE_XCHK_DIRTREE_EVALUATE_EVENT(xrep_dirtree_decided_fate);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_lookup);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_try_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_unlink);
+DEFINE_XCHK_METAPATH_EVENT(xrep_metapath_link);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 35a8c1b8b3cb..3d52e9d7ad57 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -318,14 +318,16 @@ xfs_bmap_update_create_done(
return &budp->bud_item;
}
-/* Take a passive ref to the AG containing the space we're mapping. */
+/* Take a passive ref to the group containing the space we're mapping. */
static inline void
xfs_bmap_update_get_group(
struct xfs_mount *mp,
struct xfs_bmap_intent *bi)
{
+ enum xfs_group_type type = XG_TYPE_AG;
+
if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
- return;
+ type = XG_TYPE_RTG;
/*
* Bump the intent count on behalf of the deferred rmap and refcount
@@ -334,7 +336,8 @@ xfs_bmap_update_get_group(
* intent drops the intent count, ensuring that the intent count
* remains nonzero across the transaction roll.
*/
- bi->bi_pag = xfs_perag_intent_get(mp, bi->bi_bmap.br_startblock);
+ bi->bi_group = xfs_group_intent_get(mp, bi->bi_bmap.br_startblock,
+ type);
}
/* Add this deferred BUI to the transaction. */
@@ -343,8 +346,6 @@ xfs_bmap_defer_add(
struct xfs_trans *tp,
struct xfs_bmap_intent *bi)
{
- trace_xfs_bmap_defer(bi);
-
xfs_bmap_update_get_group(tp->t_mountp, bi);
/*
@@ -357,18 +358,9 @@ xfs_bmap_defer_add(
*/
if (bi->bi_type == XFS_BMAP_MAP)
bi->bi_owner->i_delayed_blks += bi->bi_bmap.br_blockcount;
- xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
-}
-
-/* Release a passive AG ref after finishing mapping work. */
-static inline void
-xfs_bmap_update_put_group(
- struct xfs_bmap_intent *bi)
-{
- if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
- return;
- xfs_perag_intent_put(bi->bi_pag);
+ trace_xfs_bmap_defer(bi);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
}
/* Cancel a deferred bmap update. */
@@ -381,7 +373,7 @@ xfs_bmap_update_cancel_item(
if (bi->bi_type == XFS_BMAP_MAP)
bi->bi_owner->i_delayed_blks -= bi->bi_bmap.br_blockcount;
- xfs_bmap_update_put_group(bi);
+ xfs_group_intent_put(bi->bi_group);
kmem_cache_free(xfs_bmap_intent_cache, bi);
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4719ec90029c..a59bbe767a7d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -29,6 +29,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
/* Kernel only BMAP related definitions and functions */
@@ -41,16 +42,12 @@ xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
if (XFS_IS_REALTIME_INODE(ip))
- return XFS_FSB_TO_BB(ip->i_mount, fsb);
+ return xfs_rtb_to_daddr(ip->i_mount, fsb);
return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
/*
* Routine to zero an extent on disk allocated to the specific inode.
- *
- * The VFS functions take a linearised filesystem block offset, so we have to
- * convert the sparse xfs fsb to the right format first.
- * VFS types are real funky, too.
*/
int
xfs_zero_extent(
@@ -58,15 +55,10 @@ xfs_zero_extent(
xfs_fsblock_t start_fsb,
xfs_off_t count_fsb)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
- sector_t block = XFS_BB_TO_FSBT(mp, sector);
-
- return blkdev_issue_zeroout(target->bt_bdev,
- block << (mp->m_super->s_blocksize_bits - 9),
- count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_KERNEL, 0);
+ return blkdev_issue_zeroout(xfs_inode_buftarg(ip)->bt_bdev,
+ xfs_fsb_to_db(ip, start_fsb),
+ XFS_FSB_TO_BB(ip->i_mount, count_fsb),
+ GFP_KERNEL, 0);
}
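
The simplification works because xfs_fsb_to_db already yields a 512-byte-sector daddr and XFS_FSB_TO_BB converts the length the same way, so the old round trip through fs blocks and back was the identity for aligned addresses. A worked check, assuming 4096-byte filesystem blocks (a sketch of the arithmetic, not the kernel macros):

#include <assert.h>
#include <stdint.h>

#define BBSHIFT 9			/* 512-byte basic blocks */

int main(void)
{
	uint32_t blocksize_bits = 12;	/* 4096-byte fs blocks */
	uint64_t fsb = 100;		/* some filesystem block number */

	/* FSB -> BB: 8 sectors per block at this block size */
	uint64_t daddr = fsb << (blocksize_bits - BBSHIFT);

	/* the old code converted the daddr back to fs blocks, then
	 * shifted by (s_blocksize_bits - 9) again -- a no-op for
	 * block-aligned addresses */
	uint64_t block = daddr >> (blocksize_bits - BBSHIFT);
	uint64_t sector = block << (blocksize_bits - BBSHIFT);

	assert(daddr == 800);
	assert(sector == daddr);
	return 0;
}
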
/*
@@ -540,16 +532,20 @@ xfs_can_free_eofblocks(
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (xfs_inode_has_bigrtalloc(ip))
- end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
+ end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
/*
- * Check if there is an post-EOF extent to free.
+ * Check if there is a post-EOF extent to free. If there are any
+ * delalloc blocks attached to the inode (data fork delalloc
+ * reservations or CoW extents of any kind), we need to free them so
+ * that inactivation doesn't fail to erase them.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
+ if (ip->i_delayed_blks ||
+ xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
found_blocks = true;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return found_blocks;
@@ -858,8 +854,8 @@ xfs_free_file_space(
/* We can only free complete realtime extents. */
if (xfs_inode_has_bigrtalloc(ip)) {
- startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
- endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
+ startoffset_fsb = xfs_fileoff_roundup_rtx(mp, startoffset_fsb);
+ endoffset_fsb = xfs_fileoff_rounddown_rtx(mp, endoffset_fsb);
}
/*
@@ -1527,6 +1523,18 @@ xfs_swap_extents(
goto out_unlock;
}
+ /*
+ * The rmapbt implementation is unable to resume a swapext operation
+ * after a crash if the allocation unit size is larger than a block.
+ * This (deprecated) interface will not be upgraded to handle this
+ * situation. Defragmentation must be performed with the commit range
+ * ioctl.
+ */
+ if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(ip->i_mount)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_qm_dqattach(ip);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa4dbda7b536..e8196f5778e2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2115,6 +2115,13 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
+ if (bdev_can_atomic_write(btp->bt_bdev)) {
+ btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
+ btp->bt_bdev);
+ btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
+ btp->bt_bdev);
+ }
+
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
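
The cached min/max atomic write units describe what the device can commit as a single untorn write. As a rough illustration of how such limits are typically consulted (an assumption about usage based on the documented block-layer rules, not code from this patch): the length must be a power of two within [min, max] and the offset naturally aligned.

#include <stdbool.h>
#include <stdint.h>

static bool atomic_write_len_ok(uint64_t pos, uint32_t len,
		uint32_t awu_min, uint32_t awu_max)
{
	if (!awu_min || !awu_max)
		return false;		/* device lacks atomic writes */
	if (len < awu_min || len > awu_max)
		return false;
	if (len & (len - 1))
		return false;		/* must be a power of two */
	return (pos & (len - 1)) == 0;	/* naturally aligned offset */
}
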
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 209a389f2abc..3d56bc7a35cc 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -124,6 +124,10 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
+ /* Atomic write unit values */
+ unsigned int bt_bdev_awu_min;
+ unsigned int bt_bdev_awu_max;
+
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 5180cbf5a90b..3d0c6402cb36 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -25,6 +25,8 @@
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_sb.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtbitmap.h"
/*
* This is the number of entries in the l_buf_cancel_table used during
@@ -393,9 +395,18 @@ xlog_recover_validate_buf_type(
break;
#ifdef CONFIG_XFS_RT
case XFS_BLFT_RTBITMAP_BUF:
+ if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) {
+ warnmsg = "Bad rtbitmap magic!";
+ break;
+ }
+ bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP);
+ break;
case XFS_BLFT_RTSUMMARY_BUF:
- /* no magic numbers for verification of RT buffers */
- bp->b_ops = &xfs_rtbuf_ops;
+ if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) {
+ warnmsg = "Bad rtsummary magic!";
+ break;
+ }
+ bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY);
break;
#endif /* CONFIG_XFS_RT */
default:
@@ -704,6 +715,7 @@ xlog_recover_do_primary_sb_buffer(
{
struct xfs_dsb *dsb = bp->b_addr;
xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount;
+ xfs_rgnumber_t orig_rgcount = mp->m_sb.sb_rgcount;
int error;
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
@@ -722,17 +734,32 @@ xlog_recover_do_primary_sb_buffer(
xfs_alert(mp, "Shrinking AG count in log recovery not supported");
return -EFSCORRUPTED;
}
+ if (mp->m_sb.sb_rgcount < orig_rgcount) {
+ xfs_warn(mp,
+ "Shrinking rtgroup count in log recovery not supported");
+ return -EFSCORRUPTED;
+ }
/*
- * Growfs can also grow the last existing AG. In this case we also need
- * to update the length in the in-core perag structure and values
- * depending on it.
+ * If the last AG was grown or shrunk, we also need to update the
+ * length in the in-core perag structure and values depending on it.
*/
error = xfs_update_last_ag_size(mp, orig_agcount);
if (error)
return error;
/*
+ * If the last rtgroup was grown or shrunk, we also need to update the
+ * length in the in-core rtgroup structure and values depending on it.
+ * Ignore this on any filesystem with zero rtgroups.
+ */
+ if (orig_rgcount > 0) {
+ error = xfs_update_last_rtgroup_size(mp, orig_rgcount);
+ if (error)
+ return error;
+ }
+
+ /*
* Initialize the new perags, and also update various block and inode
* allocator setting based off the number of AGs or total blocks.
* Because of the latter this also needs to happen if the agcount did
@@ -745,6 +772,13 @@ xlog_recover_do_primary_sb_buffer(
return error;
}
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+
+ error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount,
+ mp->m_sb.sb_rextents);
+ if (error) {
+ xfs_warn(mp, "Failed recovery rtgroup init: %d", error);
+ return error;
+ }
return 0;
}
@@ -791,11 +825,20 @@ xlog_recover_get_buf_lsn(
* UUIDs, so we must recover them immediately.
*/
blft = xfs_blft_from_flags(buf_f);
- if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
+ if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF ||
+ blft == XFS_BLFT_RTSUMMARY_BUF))
goto recover_immediately;
magic32 = be32_to_cpu(*(__be32 *)blk);
switch (magic32) {
+ case XFS_RTSUMMARY_MAGIC:
+ case XFS_RTBITMAP_MAGIC: {
+ struct xfs_rtbuf_blkinfo *hdr = blk;
+
+ lsn = be64_to_cpu(hdr->rt_lsn);
+ uuid = &hdr->rt_uuid;
+ break;
+ }
case XFS_ABTB_CRC_MAGIC:
case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
@@ -1037,6 +1080,18 @@ xlog_recover_buf_commit_pass2(
current_lsn);
if (error)
goto out_release;
+
+ /* Update the rt superblock if we have one. */
+ if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) {
+ struct xfs_buf *rtsb_bp = mp->m_rtsb_bp;
+
+ xfs_buf_lock(rtsb_bp);
+ xfs_buf_hold(rtsb_bp);
+ xfs_update_rtsb(rtsb_bp, bp);
+ rtsb_bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(rtsb_bp, buffer_list);
+ xfs_buf_relse(rtsb_bp);
+ }
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d8c4a5dcca7a..c4bd145f5ec1 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -21,6 +21,7 @@
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
/*
* Notes on an efficient, low latency fstrim algorithm
@@ -72,6 +73,8 @@
* extent search so that it overlaps in flight discard IO.
*/
+#define XFS_DISCARD_MAX_EXAMINE (100)
+
struct workqueue_struct *xfs_discard_wq;
static void
@@ -81,7 +84,7 @@ xfs_discard_endio_work(
struct xfs_busy_extents *extents =
container_of(work, struct xfs_busy_extents, endio_work);
- xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+ xfs_extent_busy_clear(&extents->extent_list, false);
kfree(extents->owner);
}
@@ -100,6 +103,24 @@ xfs_discard_endio(
bio_put(bio);
}
+static inline struct block_device *
+xfs_group_bdev(
+ const struct xfs_group *xg)
+{
+ struct xfs_mount *mp = xg->xg_mount;
+
+ switch (xg->xg_type) {
+ case XG_TYPE_AG:
+ return mp->m_ddev_targp->bt_bdev;
+ case XG_TYPE_RTG:
+ return mp->m_rtdev_targp->bt_bdev;
+ default:
+ ASSERT(0);
+ break;
+ }
+ return NULL;
+}
+
/*
* Walk the discard list and issue discards on all the busy extents in the
* list. We plug and chain the bios so that we only need a single completion
@@ -117,11 +138,11 @@ xfs_discard_extents(
blk_start_plug(&plug);
list_for_each_entry(busyp, &extents->extent_list, list) {
- trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
- busyp->length);
+ trace_xfs_discard_extent(busyp->group, busyp->bno,
+ busyp->length);
- error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
- XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
+ xfs_gbno_to_daddr(busyp->group, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
@@ -160,13 +181,13 @@ xfs_trim_gather_extents(
struct xfs_trim_cur *tcur,
struct xfs_busy_extents *extents)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_trans *tp;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
int error;
int i;
- int batch = 100;
+ int batch = XFS_DISCARD_MAX_EXAMINE;
/*
* Force out the log. This means any transactions that might have freed
@@ -239,11 +260,11 @@ xfs_trim_gather_extents(
* overlapping ranges for now.
*/
if (fbno + flen < tcur->start) {
- trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
goto next_extent;
}
if (fbno > tcur->end) {
- trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
+ trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
if (tcur->by_bno) {
tcur->count = 0;
break;
@@ -261,7 +282,7 @@ xfs_trim_gather_extents(
/* Too small? Give up. */
if (flen < tcur->minlen) {
- trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
if (tcur->by_bno)
goto next_extent;
tcur->count = 0;
@@ -272,12 +293,12 @@ xfs_trim_gather_extents(
* If any blocks in the range are still busy, skip the
* discard and try again the next time.
*/
- if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
- trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen);
+ if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
+ trace_xfs_discard_busy(pag_group(pag), fbno, flen);
goto next_extent;
}
- xfs_extent_busy_insert_discard(pag, fbno, flen,
+ xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
&extents->extent_list);
next_extent:
if (tcur->by_bno)
@@ -301,7 +322,7 @@ next_extent:
* we aren't going to issue a discard on them any more.
*/
if (error)
- xfs_extent_busy_clear(mp, &extents->extent_list, false);
+ xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
out_trans_cancel:
@@ -335,7 +356,7 @@ xfs_trim_perag_extents(
};
int error = 0;
- if (start != 0 || end != pag->block_count)
+ if (start != 0 || end != pag_group(pag)->xg_block_count)
tcur.by_bno = true;
do {
@@ -347,7 +368,6 @@ xfs_trim_perag_extents(
break;
}
- extents->mount = pag->pag_mount;
extents->owner = extents;
INIT_LIST_HEAD(&extents->extent_list);
@@ -367,7 +387,7 @@ xfs_trim_perag_extents(
* list after this function call, as it may have been freed by
* the time control returns to us.
*/
- error = xfs_discard_extents(pag->pag_mount, extents);
+ error = xfs_discard_extents(pag_mount(pag), extents);
if (error)
break;
@@ -389,8 +409,8 @@ xfs_trim_datadev_extents(
{
xfs_agnumber_t start_agno, end_agno;
xfs_agblock_t start_agbno, end_agbno;
+ struct xfs_perag *pag = NULL;
xfs_daddr_t ddev_end;
- struct xfs_perag *pag;
int last_error = 0, error;
ddev_end = min_t(xfs_daddr_t, end,
@@ -401,10 +421,10 @@ xfs_trim_datadev_extents(
end_agno = xfs_daddr_to_agno(mp, ddev_end);
end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
- for_each_perag_range(mp, start_agno, end_agno, pag) {
- xfs_agblock_t agend = pag->block_count;
+ while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
+ xfs_agblock_t agend = pag_group(pag)->xg_block_count;
- if (start_agno == end_agno)
+ if (pag_agno(pag) == end_agno)
agend = end_agbno;
error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
if (error)
@@ -479,7 +499,7 @@ xfs_discard_rtdev_extents(
trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
error = __blkdev_issue_discard(bdev,
- XFS_FSB_TO_BB(mp, busyp->bno),
+ xfs_rtb_to_daddr(mp, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_NOFS, &bio);
if (error)
@@ -506,7 +526,7 @@ xfs_discard_rtdev_extents(
static int
xfs_trim_gather_rtextent(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -525,12 +545,12 @@ xfs_trim_gather_rtextent(
return -ECANCELED;
}
- rbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
- rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
+ rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
/* Ignore too small. */
if (rlen < tr->minlen_fsb) {
- trace_xfs_discard_rttoosmall(mp, rbno, rlen);
+ trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
return 0;
}
@@ -547,70 +567,185 @@ xfs_trim_gather_rtextent(
return 0;
}
+/* Trim extents on an !rtgroups realtime device */
static int
-xfs_trim_rtdev_extents(
- struct xfs_mount *mp,
- xfs_daddr_t start,
- xfs_daddr_t end,
+xfs_trim_rtextents(
+ struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t low,
+ xfs_rtxnum_t high,
xfs_daddr_t minlen)
{
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_trim_rtdev tr = {
.minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
+ .extent_list = LIST_HEAD_INIT(tr.extent_list),
};
- xfs_rtxnum_t low, high;
struct xfs_trans *tp;
- xfs_daddr_t rtdev_daddr;
int error;
- INIT_LIST_HEAD(&tr.extent_list);
-
- /* Shift the start and end downwards to match the rt device. */
- rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
- if (start > rtdev_daddr)
- start -= rtdev_daddr;
- else
- start = 0;
-
- if (end <= rtdev_daddr)
- return 0;
- end -= rtdev_daddr;
-
error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- end = min_t(xfs_daddr_t, end,
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1);
-
- /* Convert the rt blocks to rt extents */
- low = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
- high = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
-
/*
* Walk the free ranges between low and high. The query_range function
* trims the extents returned.
*/
do {
- tr.stop_rtx = low + (mp->m_sb.sb_blocksize * NBBY);
- xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
- error = xfs_rtalloc_query_range(mp, tp, low, high,
+ tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_range(rtg, tp, low, high,
xfs_trim_gather_rtextent, &tr);
if (error == -ECANCELED)
error = 0;
if (error) {
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
xfs_discard_free_rtdev_extents(&tr);
break;
}
if (list_empty(&tr.extent_list)) {
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
break;
}
error = xfs_discard_rtdev_extents(mp, &tr);
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error)
+ break;
+
+ low = tr.restart_rtx;
+ } while (!xfs_trim_should_stop() && low <= high);
+
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+struct xfs_trim_rtgroup {
+ /* list of rtgroup extents to free */
+ struct xfs_busy_extents *extents;
+
+ /* minimum length that caller allows us to trim */
+ xfs_rtblock_t minlen_fsb;
+
+ /* restart point for the rtbitmap walk */
+ xfs_rtxnum_t restart_rtx;
+
+ /* number of extents to examine before stopping to issue discard ios */
+ int batch;
+
+ /* number of extents queued for discard */
+ int queued;
+};
+
+static int
+xfs_trim_gather_rtgroup_extent(
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_trim_rtgroup *tr = priv;
+ xfs_rgblock_t rgbno;
+ xfs_extlen_t len;
+
+ if (--tr->batch <= 0) {
+ /*
+ * If we've checked a large number of extents, update the cursor
+ * so that the next batch restarts from this extent.
+ */
+ tr->restart_rtx = rec->ar_startext;
+ return -ECANCELED;
+ }
+
+ rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
+ len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
+
+ /* Ignore too small. */
+ if (len < tr->minlen_fsb) {
+ trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
+ return 0;
+ }
+
+ /*
+ * If any blocks in the range are still busy, skip the discard and try
+ * again the next time.
+ */
+ if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
+ trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
+ return 0;
+ }
+
+ xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
+ &tr->extents->extent_list);
+
+ tr->queued++;
+ tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
+ return 0;
+}
+
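
The callback above shows a lock-friendly batching idiom: it decrements a per-pass budget and, when the budget is spent, records a restart cursor and returns -ECANCELED, which the caller treats as "pause" rather than failure so it can drop the rtgroup lock, issue the queued discards, and resume the range query. A self-contained demo of the idiom (illustrative names, deliberately tiny budget):

#include <errno.h>
#include <stdio.h>

#define BATCH 3		/* tiny budget so the demo pauses visibly */

struct walk_state {
	int budget;
	int restart;	/* where the next pass should resume */
};

/* visit [lo, hi]; the callback may return -ECANCELED to pause the walk */
static int query_range(int lo, int hi,
		int (*fn)(int item, struct walk_state *ws),
		struct walk_state *ws)
{
	for (int i = lo; i <= hi; i++) {
		int ret = fn(i, ws);
		if (ret)
			return ret;
	}
	return 0;
}

static int gather(int item, struct walk_state *ws)
{
	if (--ws->budget < 0) {
		ws->restart = item;	/* revisit this item next pass */
		return -ECANCELED;
	}
	printf("queued %d\n", item);
	ws->restart = item + 1;
	return 0;
}

int main(void)
{
	struct walk_state ws = { .restart = 0 };
	int low = 0, high = 7, ret;

	do {
		ws.budget = BATCH;
		ret = query_range(low, high, gather, &ws);
		if (ret == -ECANCELED)
			ret = 0;	/* a pause, not an error */
		if (ret)
			return 1;
		/* ...issue discards for the queued items here... */
		low = ws.restart;
	} while (low <= high);
	return 0;
}
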
+/* Trim extents in this rtgroup using the busy extent machinery. */
+static int
+xfs_trim_rtgroup_extents(
+ struct xfs_rtgroup *rtg,
+ xfs_rtxnum_t low,
+ xfs_rtxnum_t high,
+ xfs_daddr_t minlen)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_trim_rtgroup tr = {
+ .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
+ };
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ /*
+ * Walk the free ranges between low and high. The query_range function
+ * trims the extents returned.
+ */
+ do {
+ tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
+ if (!tr.extents) {
+ error = -ENOMEM;
+ break;
+ }
+
+ tr.queued = 0;
+ tr.batch = XFS_DISCARD_MAX_EXAMINE;
+ tr.extents->owner = tr.extents;
+ INIT_LIST_HEAD(&tr.extents->extent_list);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_range(rtg, tp, low, high,
+ xfs_trim_gather_rtgroup_extent, &tr);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error == -ECANCELED)
+ error = 0;
+ if (error) {
+ kfree(tr.extents);
+ break;
+ }
+
+ if (!tr.queued)
+ break;
+
+ /*
+ * We hand the extent list to the discard function here so the
+ * discarded extents can be removed from the busy extent list.
+ * This allows the discards to run asynchronously with
+ * gathering the next round of extents to discard.
+ *
+ * However, we must ensure that we do not reference the extent
+ * list after this function call, as it may have been freed by
+ * the time control returns to us.
+ */
+ error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
if (error)
break;
@@ -620,6 +755,63 @@ xfs_trim_rtdev_extents(
xfs_trans_cancel(tp);
return error;
}
+
+static int
+xfs_trim_rtdev_extents(
+ struct xfs_mount *mp,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen)
+{
+ xfs_rtblock_t start_rtbno, end_rtbno;
+ xfs_rtxnum_t start_rtx, end_rtx;
+ xfs_rgnumber_t start_rgno, end_rgno;
+ xfs_daddr_t daddr_offset;
+ int last_error = 0, error;
+ struct xfs_rtgroup *rtg = NULL;
+
+ /* Shift the start and end downwards to match the rt device. */
+ daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (start > daddr_offset)
+ start -= daddr_offset;
+ else
+ start = 0;
+ start_rtbno = xfs_daddr_to_rtb(mp, start);
+ start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
+ start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
+
+ if (end <= daddr_offset)
+ return 0;
+ else
+ end -= daddr_offset;
+ end_rtbno = xfs_daddr_to_rtb(mp, end);
+ end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
+ end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
+
+ while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
+ xfs_rtxnum_t rtg_end = rtg->rtg_extents;
+
+ if (rtg_rgno(rtg) == end_rgno)
+ rtg_end = min(rtg_end, end_rtx);
+
+ if (xfs_has_rtgroups(mp))
+ error = xfs_trim_rtgroup_extents(rtg, start_rtx,
+ rtg_end, minlen);
+ else
+ error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
+ minlen);
+ if (error)
+ last_error = error;
+
+ if (xfs_trim_should_stop()) {
+ xfs_rtgroup_rele(rtg);
+ break;
+ }
+ start_rtx = 0;
+ }
+
+ return last_error;
+}
#else
# define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */
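
The group walk above clamps only the final group's end and resets the start offset to zero after the first group, so a trim spanning several rtgroups covers a partial first group, whole middle groups, and a partial last group. A toy model of that loop, treating the start/end values as group-relative extent offsets (an illustrative simplification of the real conversion helpers):

#include <stdint.h>
#include <stdio.h>

static void trim_groups(uint32_t g0, uint32_t g1, uint64_t start_rtx,
		uint64_t end_rtx, uint64_t extents_per_group)
{
	for (uint32_t g = g0; g <= g1; g++) {
		uint64_t gend = extents_per_group;

		if (g == g1 && end_rtx < gend)
			gend = end_rtx;	/* clamp only the last group */
		printf("group %u: trim extents [%llu, %llu]\n", g,
				(unsigned long long)start_rtx,
				(unsigned long long)gend);
		start_rtx = 0;	/* later groups start at offset 0 */
	}
}

int main(void)
{
	/* a trim starting mid-group 1 and ending mid-group 3 */
	trim_groups(1, 3, 500, 200, 1000);
	return 0;
}
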
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index c1b211c260a9..ff982d983989 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -277,6 +277,25 @@ xfs_qm_init_dquot_blk(
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
+static void
+xfs_dquot_set_prealloc(
+ struct xfs_dquot_pre *pre,
+ const struct xfs_dquot_res *res)
+{
+ xfs_qcnt_t space;
+
+ pre->q_prealloc_hi_wmark = res->hardlimit;
+ pre->q_prealloc_lo_wmark = res->softlimit;
+
+ space = div_u64(pre->q_prealloc_hi_wmark, 100);
+ if (!pre->q_prealloc_lo_wmark)
+ pre->q_prealloc_lo_wmark = space * 95;
+
+ pre->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+ pre->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+ pre->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
+
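
The extracted helper keeps the historical watermark math, now applied once per resource: the high watermark is the hard limit, the low watermark is the soft limit (or 95% of the hard limit when no soft limit is set), and the q_low_space array holds 1%, 3%, and 5% of the hard limit for throttling decisions. Worked numbers as a compilable check (stand-in types, not the kernel structures):

#include <assert.h>
#include <stdint.h>

struct prealloc {
	uint64_t lo_wmark, hi_wmark;
	uint64_t low_space[3];	/* 1%, 3%, 5% of the hard limit */
};

/* mirrors the watermark derivation above (illustrative types) */
static void set_prealloc(struct prealloc *pre, uint64_t hardlimit,
		uint64_t softlimit)
{
	uint64_t space = hardlimit / 100;

	pre->hi_wmark = hardlimit;
	pre->lo_wmark = softlimit ? softlimit : space * 95;
	pre->low_space[0] = space;
	pre->low_space[1] = space * 3;
	pre->low_space[2] = space * 5;
}

int main(void)
{
	struct prealloc p;

	/* 10000-block hard limit, no soft limit */
	set_prealloc(&p, 10000, 0);
	assert(p.hi_wmark == 10000);
	assert(p.lo_wmark == 9500);	/* defaults to 95% of hard */
	assert(p.low_space[0] == 100);	/* throttling begins by 1% */
	assert(p.low_space[2] == 500);
	return 0;
}
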
/*
* Initialize the dynamic speculative preallocation thresholds. The lo/hi
* watermarks correspond to the soft and hard limits by default. If a soft limit
@@ -285,22 +304,8 @@ xfs_qm_init_dquot_blk(
void
xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
{
- uint64_t space;
-
- dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;
- dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
- if (!dqp->q_prealloc_lo_wmark) {
- dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
- do_div(dqp->q_prealloc_lo_wmark, 100);
- dqp->q_prealloc_lo_wmark *= 95;
- }
-
- space = dqp->q_prealloc_hi_wmark;
-
- do_div(space, 100);
- dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
- dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
- dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+ xfs_dquot_set_prealloc(&dqp->q_blk_prealloc, &dqp->q_blk);
+ xfs_dquot_set_prealloc(&dqp->q_rtb_prealloc, &dqp->q_rtb);
}
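
The refactored helper makes the watermark policy easy to state: the high watermark is the hard limit, the low watermark defaults to 95% of it when no soft limit is set, and the 1/3/5-percent low-space thresholds are multiples of one hundredth of the hard limit. A standalone sketch of the same arithmetic, with simplified field names standing in for the kernel's:

#include <stdint.h>
#include <stdio.h>

struct prealloc {
	uint64_t hi_wmark;
	uint64_t lo_wmark;
	uint64_t low_space[3];		/* 1%, 3%, 5% */
};

static void set_prealloc(struct prealloc *pre, uint64_t hard, uint64_t soft)
{
	uint64_t space = hard / 100;	/* stands in for div_u64() */

	pre->hi_wmark = hard;
	pre->lo_wmark = soft ? soft : space * 95;
	pre->low_space[0] = space;
	pre->low_space[1] = space * 3;
	pre->low_space[2] = space * 5;
}

int main(void)
{
	struct prealloc pre;

	set_prealloc(&pre, 1000000, 0);	/* no soft limit configured */
	printf("lo=%llu 1%%=%llu 5%%=%llu\n",
	       (unsigned long long)pre.lo_wmark,
	       (unsigned long long)pre.low_space[0],
	       (unsigned long long)pre.low_space[2]);
	return 0;
}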
/*
@@ -983,6 +988,7 @@ xfs_qm_dqget_inode(
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(xfs_inode_dquot(ip, type) == NULL);
+ ASSERT(!xfs_is_metadir_inode(ip));
id = xfs_qm_id_for_quotatype(ip, type);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 677bb2dc9ac9..d73d179df009 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -56,6 +56,12 @@ xfs_dquot_res_over_limits(
return false;
}
+struct xfs_dquot_pre {
+ xfs_qcnt_t q_prealloc_lo_wmark;
+ xfs_qcnt_t q_prealloc_hi_wmark;
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+};
+
/*
* The incore dquot structure
*/
@@ -76,9 +82,9 @@ struct xfs_dquot {
struct xfs_dq_logitem q_logitem;
- xfs_qcnt_t q_prealloc_lo_wmark;
- xfs_qcnt_t q_prealloc_hi_wmark;
- int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct xfs_dquot_pre q_blk_prealloc;
+ struct xfs_dquot_pre q_rtb_prealloc;
+
struct mutex q_qlock;
struct completion q_flush;
atomic_t q_pincount;
@@ -192,7 +198,11 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
int64_t freesp;
freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved;
- if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+ if (freesp < dqp->q_blk_prealloc.q_low_space[XFS_QLOWSP_1_PCNT])
+ return true;
+
+ freesp = dqp->q_rtb.hardlimit - dqp->q_rtb.reserved;
+ if (freesp < dqp->q_rtb_prealloc.q_low_space[XFS_QLOWSP_1_PCNT])
return true;
return false;
diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c
index 7bdb9688c0f5..5ede81fadbd8 100644
--- a/fs/xfs/xfs_drain.c
+++ b/fs/xfs/xfs_drain.c
@@ -94,55 +94,39 @@ static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr)
}
/*
- * Get a passive reference to the AG that contains a fsbno and declare an intent
- * to update its metadata.
+ * Get a passive reference to the group that contains a fsbno and declare an
+ * intent to update its metadata.
+ *
+ * Other threads that need exclusive access can decide to back off if they see
+ * declared intentions.
*/
-struct xfs_perag *
-xfs_perag_intent_get(
+struct xfs_group *
+xfs_group_intent_get(
struct xfs_mount *mp,
- xfs_fsblock_t fsbno)
+ xfs_fsblock_t fsbno,
+ enum xfs_group_type type)
{
- struct xfs_perag *pag;
+ struct xfs_group *xg;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno));
- if (!pag)
+ xg = xfs_group_get_by_fsb(mp, fsbno, type);
+ if (!xg)
return NULL;
-
- xfs_perag_intent_hold(pag);
- return pag;
-}
-
-/*
- * Release our intent to update this AG's metadata, and then release our
- * passive ref to the AG.
- */
-void
-xfs_perag_intent_put(
- struct xfs_perag *pag)
-{
- xfs_perag_intent_rele(pag);
- xfs_perag_put(pag);
+ trace_xfs_group_intent_hold(xg, __return_address);
+ xfs_defer_drain_grab(&xg->xg_intents_drain);
+ return xg;
}
/*
- * Declare an intent to update AG metadata. Other threads that need exclusive
- * access can decide to back off if they see declared intentions.
+ * Release our intent to update this group's metadata, and then release our
+ * passive ref to it.
*/
void
-xfs_perag_intent_hold(
- struct xfs_perag *pag)
+xfs_group_intent_put(
+ struct xfs_group *xg)
{
- trace_xfs_perag_intent_hold(pag, __return_address);
- xfs_defer_drain_grab(&pag->pag_intents_drain);
-}
-
-/* Release our intent to update this AG's metadata. */
-void
-xfs_perag_intent_rele(
- struct xfs_perag *pag)
-{
- trace_xfs_perag_intent_rele(pag, __return_address);
- xfs_defer_drain_rele(&pag->pag_intents_drain);
+ trace_xfs_group_intent_rele(xg, __return_address);
+ xfs_defer_drain_rele(&xg->xg_intents_drain);
+ xfs_group_put(xg);
}
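
The grab/rele/wait trio behind these helpers is, at heart, a counting drain. A user-space analogue using pthreads in place of the kernel waitqueue, illustrative only:

#include <pthread.h>

struct drain {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	unsigned int count;
};

static struct drain dr = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
};

static void drain_grab(struct drain *d)	/* declare an intent */
{
	pthread_mutex_lock(&d->lock);
	d->count++;
	pthread_mutex_unlock(&d->lock);
}

static void drain_rele(struct drain *d)	/* retire an intent */
{
	pthread_mutex_lock(&d->lock);
	if (--d->count == 0)
		pthread_cond_broadcast(&d->cond);
	pthread_mutex_unlock(&d->lock);
}

static void drain_wait(struct drain *d)	/* e.g. scrub wants exclusivity */
{
	pthread_mutex_lock(&d->lock);
	while (d->count)
		pthread_cond_wait(&d->cond, &d->lock);
	pthread_mutex_unlock(&d->lock);
}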
/*
@@ -150,17 +134,19 @@ xfs_perag_intent_rele(
* Callers must not hold any AG header buffers.
*/
int
-xfs_perag_intent_drain(
- struct xfs_perag *pag)
+xfs_group_intent_drain(
+ struct xfs_group *xg)
{
- trace_xfs_perag_wait_intents(pag, __return_address);
- return xfs_defer_drain_wait(&pag->pag_intents_drain);
+ trace_xfs_group_wait_intents(xg, __return_address);
+ return xfs_defer_drain_wait(&xg->xg_intents_drain);
}
-/* Has anyone declared an intent to update this AG? */
+/*
+ * Has anyone declared an intent to update this group?
+ */
bool
-xfs_perag_intent_busy(
- struct xfs_perag *pag)
+xfs_group_intent_busy(
+ struct xfs_group *xg)
{
- return xfs_defer_drain_busy(&pag->pag_intents_drain);
+ return xfs_defer_drain_busy(&xg->xg_intents_drain);
}
diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h
index 775164f54ea6..efcf88df9a5e 100644
--- a/fs/xfs/xfs_drain.h
+++ b/fs/xfs/xfs_drain.h
@@ -6,6 +6,7 @@
#ifndef XFS_DRAIN_H_
#define XFS_DRAIN_H_
+struct xfs_group;
struct xfs_perag;
#ifdef CONFIG_XFS_DRAIN_INTENTS
@@ -61,27 +62,22 @@ void xfs_drain_wait_enable(void);
* soon as the item is added to the transaction and cannot drop the counter
* until the item is finished or cancelled.
*/
-struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp,
- xfs_fsblock_t fsbno);
-void xfs_perag_intent_put(struct xfs_perag *pag);
+struct xfs_group *xfs_group_intent_get(struct xfs_mount *mp,
+ xfs_fsblock_t fsbno, enum xfs_group_type type);
+void xfs_group_intent_put(struct xfs_group *xg);
-void xfs_perag_intent_hold(struct xfs_perag *pag);
-void xfs_perag_intent_rele(struct xfs_perag *pag);
+int xfs_group_intent_drain(struct xfs_group *xg);
+bool xfs_group_intent_busy(struct xfs_group *xg);
-int xfs_perag_intent_drain(struct xfs_perag *pag);
-bool xfs_perag_intent_busy(struct xfs_perag *pag);
#else
struct xfs_defer_drain { /* empty */ };
#define xfs_defer_drain_free(dr) ((void)0)
#define xfs_defer_drain_init(dr) ((void)0)
-#define xfs_perag_intent_get(mp, fsbno) \
- xfs_perag_get((mp), XFS_FSB_TO_AGNO(mp, fsbno))
-#define xfs_perag_intent_put(pag) xfs_perag_put(pag)
-
-static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { }
-static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { }
+#define xfs_group_intent_get(_mp, _fsbno, _type) \
+ xfs_group_get_by_fsb((_mp), (_fsbno), (_type))
+#define xfs_group_intent_put(xg) xfs_group_put(xg)
#endif /* CONFIG_XFS_DRAIN_INTENTS */
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index 75cb53f090d1..9ab05ad224d1 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -217,7 +217,7 @@ xfs_exchrange_mappings(
* length in @fxr are safe to round up.
*/
if (xfs_inode_has_bigrtalloc(ip2))
- req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
+ req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
error = xfs_exchrange_estimate(&req);
if (error)
@@ -813,8 +813,6 @@ xfs_ioc_exchange_range(
.file2 = file,
};
struct xfs_exchange_range args;
- struct fd file1;
- int error;
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
@@ -828,14 +826,12 @@ xfs_ioc_exchange_range(
fxr.length = args.length;
fxr.flags = args.flags;
- file1 = fdget(args.file1_fd);
- if (!fd_file(file1))
+ CLASS(fd, file1)(args.file1_fd);
+ if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
@@ -909,8 +905,6 @@ xfs_ioc_commit_range(
struct xfs_commit_range_fresh *kern_f;
struct xfs_inode *ip2 = XFS_I(file_inode(file));
struct xfs_mount *mp = ip2->i_mount;
- struct fd file1;
- int error;
kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
@@ -934,12 +928,10 @@ xfs_ioc_commit_range(
fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
- file1 = fdget(args.file1_fd);
+ CLASS(fd, file1)(args.file1_fd);
if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index a73e7c73b664..ea43c9a6e54c 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -18,15 +18,24 @@
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
+
+struct xfs_extent_busy_tree {
+ spinlock_t eb_lock;
+ struct rb_root eb_tree;
+ unsigned int eb_gen;
+ wait_queue_head_t eb_wait;
+};
static void
xfs_extent_busy_insert_list(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
xfs_agblock_t bno,
xfs_extlen_t len,
unsigned int flags,
struct list_head *busy_list)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
struct rb_node **rbp;
@@ -34,17 +43,17 @@ xfs_extent_busy_insert_list(
new = kzalloc(sizeof(struct xfs_extent_busy),
GFP_KERNEL | __GFP_NOFAIL);
- new->agno = pag->pag_agno;
+ new->group = xfs_group_hold(xg);
new->bno = bno;
new->length = len;
INIT_LIST_HEAD(&new->list);
new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
+ trace_xfs_extent_busy(xg, bno, len);
- spin_lock(&pag->pagb_lock);
- rbp = &pag->pagb_tree.rb_node;
+ spin_lock(&eb->eb_lock);
+ rbp = &eb->eb_tree.rb_node;
while (*rbp) {
parent = *rbp;
busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
@@ -61,32 +70,32 @@ xfs_extent_busy_insert_list(
}
rb_link_node(&new->rb_node, parent, rbp);
- rb_insert_color(&new->rb_node, &pag->pagb_tree);
+ rb_insert_color(&new->rb_node, &eb->eb_tree);
/* always process discard lists in fifo order */
list_add_tail(&new->list, busy_list);
- spin_unlock(&pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
}
void
xfs_extent_busy_insert(
struct xfs_trans *tp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
xfs_agblock_t bno,
xfs_extlen_t len,
unsigned int flags)
{
- xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
+ xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy);
}
void
xfs_extent_busy_insert_discard(
- struct xfs_perag *pag,
+ struct xfs_group *xg,
xfs_agblock_t bno,
xfs_extlen_t len,
struct list_head *busy_list)
{
- xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
+ xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED,
busy_list);
}
@@ -101,18 +110,18 @@ xfs_extent_busy_insert_discard(
*/
int
xfs_extent_busy_search(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
xfs_agblock_t bno,
xfs_extlen_t len)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
struct rb_node *rbp;
struct xfs_extent_busy *busyp;
int match = 0;
/* find closest start bno overlap */
- spin_lock(&pag->pagb_lock);
- rbp = pag->pagb_tree.rb_node;
+ spin_lock(&eb->eb_lock);
+ rbp = eb->eb_tree.rb_node;
while (rbp) {
busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
if (bno < busyp->bno) {
@@ -131,7 +140,7 @@ xfs_extent_busy_search(
break;
}
}
- spin_unlock(&pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
return match;
}
@@ -148,14 +157,15 @@ xfs_extent_busy_search(
*/
STATIC bool
xfs_extent_busy_update_extent(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
struct xfs_extent_busy *busyp,
xfs_agblock_t fbno,
xfs_extlen_t flen,
- bool userdata) __releases(&pag->pagb_lock)
- __acquires(&pag->pagb_lock)
+ bool userdata)
+ __releases(&eb->eb_lock)
+ __acquires(&eb->eb_lock)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
xfs_agblock_t fend = fbno + flen;
xfs_agblock_t bbno = busyp->bno;
xfs_agblock_t bend = bbno + busyp->length;
@@ -166,9 +176,9 @@ xfs_extent_busy_update_extent(
* and retry.
*/
if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
- spin_unlock(&pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
delay(1);
- spin_lock(&pag->pagb_lock);
+ spin_lock(&eb->eb_lock);
return false;
}
@@ -241,7 +251,7 @@ xfs_extent_busy_update_extent(
* tree root, because erasing the node can rearrange the
* tree topology.
*/
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ rb_erase(&busyp->rb_node, &eb->eb_tree);
busyp->length = 0;
return false;
} else if (fend < bend) {
@@ -280,35 +290,34 @@ xfs_extent_busy_update_extent(
ASSERT(0);
}
- trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ trace_xfs_extent_busy_reuse(xg, fbno, flen);
return true;
out_force_log:
- spin_unlock(&pag->pagb_lock);
- xfs_log_force(mp, XFS_LOG_SYNC);
- trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
- spin_lock(&pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
+ xfs_log_force(xg->xg_mount, XFS_LOG_SYNC);
+ trace_xfs_extent_busy_force(xg, fbno, flen);
+ spin_lock(&eb->eb_lock);
return false;
}
-
/*
* For a given extent [fbno, flen], make sure we can reuse it safely.
*/
void
xfs_extent_busy_reuse(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
xfs_agblock_t fbno,
xfs_extlen_t flen,
bool userdata)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
struct rb_node *rbp;
ASSERT(flen > 0);
- spin_lock(&pag->pagb_lock);
+ spin_lock(&eb->eb_lock);
restart:
- rbp = pag->pagb_tree.rb_node;
+ rbp = eb->eb_tree.rb_node;
while (rbp) {
struct xfs_extent_busy *busyp =
rb_entry(rbp, struct xfs_extent_busy, rb_node);
@@ -323,11 +332,11 @@ restart:
continue;
}
- if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
+ if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen,
userdata))
goto restart;
}
- spin_unlock(&pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
}
/*
@@ -336,7 +345,7 @@ restart:
* args->minlen no suitable extent could be found, and the higher level
* code needs to force out the log and retry the allocation.
*
- * Return the current busy generation for the AG if the extent is busy. This
+ * Return the current busy generation for the group if the extent is busy. This
* value can be used to wait for at least one of the currently busy extents
* to be cleared. Note that the busy list is not guaranteed to be empty after
* the gen is woken. The state of a specific extent must always be confirmed
@@ -344,11 +353,14 @@ restart:
*/
bool
xfs_extent_busy_trim(
- struct xfs_alloc_arg *args,
+ struct xfs_group *xg,
+ xfs_extlen_t minlen,
+ xfs_extlen_t maxlen,
xfs_agblock_t *bno,
xfs_extlen_t *len,
unsigned *busy_gen)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
xfs_agblock_t fbno;
xfs_extlen_t flen;
struct rb_node *rbp;
@@ -356,11 +368,11 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
- spin_lock(&args->pag->pagb_lock);
+ spin_lock(&eb->eb_lock);
fbno = *bno;
flen = *len;
- rbp = args->pag->pagb_tree.rb_node;
- while (rbp && flen >= args->minlen) {
+ rbp = eb->eb_tree.rb_node;
+ while (rbp && flen >= minlen) {
struct xfs_extent_busy *busyp =
rb_entry(rbp, struct xfs_extent_busy, rb_node);
xfs_agblock_t fend = fbno + flen;
@@ -481,13 +493,13 @@ xfs_extent_busy_trim(
* good chance subsequent allocations will be
* contiguous.
*/
- if (bbno - fbno >= args->maxlen) {
+ if (bbno - fbno >= maxlen) {
/* left candidate fits perfect */
fend = bbno;
- } else if (fend - bend >= args->maxlen * 4) {
+ } else if (fend - bend >= maxlen * 4) {
/* right candidate has enough free space */
fbno = bend;
- } else if (bbno - fbno >= args->minlen) {
+ } else if (bbno - fbno >= minlen) {
/* left candidate fits minimum requirement */
fend = bbno;
} else {
@@ -500,14 +512,13 @@ xfs_extent_busy_trim(
out:
if (fbno != *bno || flen != *len) {
- trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
- fbno, flen);
+ trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen);
*bno = fbno;
*len = flen;
- *busy_gen = args->pag->pagb_gen;
+ *busy_gen = eb->eb_gen;
ret = true;
}
- spin_unlock(&args->pag->pagb_lock);
+ spin_unlock(&eb->eb_lock);
return ret;
fail:
/*
@@ -520,22 +531,24 @@ fail:
static bool
xfs_extent_busy_clear_one(
- struct xfs_perag *pag,
struct xfs_extent_busy *busyp,
bool do_discard)
{
+ struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents;
+
if (busyp->length) {
if (do_discard &&
!(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
return false;
}
- trace_xfs_extent_busy_clear(pag->pag_mount, busyp->agno,
- busyp->bno, busyp->length);
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ trace_xfs_extent_busy_clear(busyp->group, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &eb->eb_tree);
}
list_del_init(&busyp->list);
+ xfs_group_put(busyp->group);
kfree(busyp);
return true;
}
@@ -547,7 +560,6 @@ xfs_extent_busy_clear_one(
*/
void
xfs_extent_busy_clear(
- struct xfs_mount *mp,
struct list_head *list,
bool do_discard)
{
@@ -558,30 +570,30 @@ xfs_extent_busy_clear(
return;
do {
+ struct xfs_group *xg = xfs_group_hold(busyp->group);
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
bool wakeup = false;
- struct xfs_perag *pag;
- pag = xfs_perag_get(mp, busyp->agno);
- spin_lock(&pag->pagb_lock);
+ spin_lock(&eb->eb_lock);
do {
next = list_next_entry(busyp, list);
- if (xfs_extent_busy_clear_one(pag, busyp, do_discard))
+ if (xfs_extent_busy_clear_one(busyp, do_discard))
wakeup = true;
busyp = next;
} while (!list_entry_is_head(busyp, list, list) &&
- busyp->agno == pag->pag_agno);
+ busyp->group == xg);
if (wakeup) {
- pag->pagb_gen++;
- wake_up_all(&pag->pagb_wait);
+ eb->eb_gen++;
+ wake_up_all(&eb->eb_wait);
}
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
+ spin_unlock(&eb->eb_lock);
+ xfs_group_put(xg);
} while (!list_entry_is_head(busyp, list, list));
}
/*
- * Flush out all busy extents for this AG.
+ * Flush out all busy extents for this group.
*
* If the current transaction is holding busy extents, the caller may not want
* to wait for committed busy extents to resolve. If we are being told just to
@@ -597,10 +609,11 @@ xfs_extent_busy_clear(
int
xfs_extent_busy_flush(
struct xfs_trans *tp,
- struct xfs_perag *pag,
+ struct xfs_group *xg,
unsigned busy_gen,
uint32_t alloc_flags)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
DEFINE_WAIT (wait);
int error;
@@ -613,7 +626,7 @@ xfs_extent_busy_flush(
if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
return 0;
- if (busy_gen != READ_ONCE(pag->pagb_gen))
+ if (busy_gen != READ_ONCE(eb->eb_gen))
return 0;
if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
@@ -622,37 +635,49 @@ xfs_extent_busy_flush(
/* Wait for committed busy extents to resolve. */
do {
- prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
- if (busy_gen != READ_ONCE(pag->pagb_gen))
+ prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
+ if (busy_gen != READ_ONCE(eb->eb_gen))
break;
schedule();
} while (1);
- finish_wait(&pag->pagb_wait, &wait);
+ finish_wait(&eb->eb_wait, &wait);
return 0;
}
+static void
+xfs_extent_busy_wait_group(
+ struct xfs_group *xg)
+{
+ DEFINE_WAIT (wait);
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
+
+ do {
+ prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
+ if (RB_EMPTY_ROOT(&eb->eb_tree))
+ break;
+ schedule();
+ } while (1);
+ finish_wait(&eb->eb_wait, &wait);
+}
+
void
xfs_extent_busy_wait_all(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- DEFINE_WAIT (wait);
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
- for_each_perag(mp, agno, pag) {
- do {
- prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
- if (RB_EMPTY_ROOT(&pag->pagb_tree))
- break;
- schedule();
- } while (1);
- finish_wait(&pag->pagb_wait, &wait);
- }
+ while ((pag = xfs_perag_next(mp, pag)))
+ xfs_extent_busy_wait_group(pag_group(pag));
+
+ if (xfs_has_rtgroups(mp))
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_extent_busy_wait_group(rtg_group(rtg));
}
/*
- * Callback for list_sort to sort busy extents by the AG they reside in.
+ * Callback for list_sort to sort busy extents by the group they reside in.
*/
int
xfs_extent_busy_ag_cmp(
@@ -666,21 +691,38 @@ xfs_extent_busy_ag_cmp(
container_of(l2, struct xfs_extent_busy, list);
s32 diff;
- diff = b1->agno - b2->agno;
+ diff = b1->group->xg_gno - b2->group->xg_gno;
if (!diff)
diff = b1->bno - b2->bno;
return diff;
}
-/* Are there any busy extents in this AG? */
+/* Are there any busy extents in this group? */
bool
xfs_extent_busy_list_empty(
- struct xfs_perag *pag)
+ struct xfs_group *xg,
+ unsigned *busy_gen)
{
+ struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
bool res;
- spin_lock(&pag->pagb_lock);
- res = RB_EMPTY_ROOT(&pag->pagb_tree);
- spin_unlock(&pag->pagb_lock);
+ spin_lock(&eb->eb_lock);
+ res = RB_EMPTY_ROOT(&eb->eb_tree);
+ *busy_gen = READ_ONCE(eb->eb_gen);
+ spin_unlock(&eb->eb_lock);
return res;
}
+
+struct xfs_extent_busy_tree *
+xfs_extent_busy_alloc(void)
+{
+ struct xfs_extent_busy_tree *eb;
+
+ eb = kzalloc(sizeof(*eb), GFP_KERNEL);
+ if (!eb)
+ return NULL;
+ spin_lock_init(&eb->eb_lock);
+ init_waitqueue_head(&eb->eb_wait);
+ eb->eb_tree = RB_ROOT;
+ return eb;
+}
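
The eb_gen counter implements a generation handshake: callers sample the generation under eb_lock (via trim or list_empty), and a later flush sleeps only if no busy extent has been cleared since the sample. A rough pthread sketch of that handshake, simplified from the kernel's waitqueue version:

#include <pthread.h>

struct busy_tree {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	unsigned int gen;
};

static struct busy_tree eb = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static unsigned int busy_sample_gen(struct busy_tree *t)
{
	unsigned int gen;

	pthread_mutex_lock(&t->lock);
	gen = t->gen;			/* like *busy_gen = eb->eb_gen */
	pthread_mutex_unlock(&t->lock);
	return gen;
}

static void busy_flush(struct busy_tree *t, unsigned int busy_gen)
{
	pthread_mutex_lock(&t->lock);
	/* Sleep only if nothing has been cleared since the sample. */
	while (t->gen == busy_gen)
		pthread_cond_wait(&t->wait, &t->lock);
	pthread_mutex_unlock(&t->lock);
}

static void busy_clear(struct busy_tree *t)
{
	pthread_mutex_lock(&t->lock);
	t->gen++;			/* state changed: wake waiters */
	pthread_cond_broadcast(&t->wait);
	pthread_mutex_unlock(&t->lock);
}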
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 470032de3139..f069b04e8ea1 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -8,19 +8,18 @@
#ifndef __XFS_EXTENT_BUSY_H__
#define __XFS_EXTENT_BUSY_H__
+struct xfs_group;
struct xfs_mount;
-struct xfs_perag;
struct xfs_trans;
-struct xfs_alloc_arg;
/*
- * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
- * have been freed but whose transactions aren't committed to disk yet.
+ * Busy block/extent entry. Indexed by an rbtree in the group to mark blocks
+ * that have been freed but whose transactions aren't committed to disk yet.
*/
struct xfs_extent_busy {
- struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct rb_node rb_node; /* group by-bno indexed search tree */
struct list_head list; /* transaction busy extent list */
- xfs_agnumber_t agno;
+ struct xfs_group *group;
xfs_agblock_t bno;
xfs_extlen_t length;
unsigned int flags;
@@ -33,7 +32,6 @@ struct xfs_extent_busy {
* to discard completion.
*/
struct xfs_busy_extents {
- struct xfs_mount *mount;
struct list_head extent_list;
struct work_struct endio_work;
@@ -45,46 +43,29 @@ struct xfs_busy_extents {
void *owner;
};
-void
-xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
- xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
-
-void
-xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
- xfs_extlen_t len, struct list_head *busy_list);
-
-void
-xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
- bool do_discard);
-
-int
-xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag,
- xfs_agblock_t bno, xfs_extlen_t len);
-
-void
-xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag,
- xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
-
-bool
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
- xfs_extlen_t *len, unsigned *busy_gen);
-
-int
-xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag,
+void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_group *xg,
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
+void xfs_extent_busy_insert_discard(struct xfs_group *xg, xfs_agblock_t bno,
+ xfs_extlen_t len, struct list_head *busy_list);
+void xfs_extent_busy_clear(struct list_head *list, bool do_discard);
+int xfs_extent_busy_search(struct xfs_group *xg, xfs_agblock_t bno,
+ xfs_extlen_t len);
+void xfs_extent_busy_reuse(struct xfs_group *xg, xfs_agblock_t fbno,
+ xfs_extlen_t flen, bool userdata);
+bool xfs_extent_busy_trim(struct xfs_group *xg, xfs_extlen_t minlen,
+ xfs_extlen_t maxlen, xfs_agblock_t *bno, xfs_extlen_t *len,
+ unsigned *busy_gen);
+int xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_group *xg,
unsigned busy_gen, uint32_t alloc_flags);
+void xfs_extent_busy_wait_all(struct xfs_mount *mp);
+bool xfs_extent_busy_list_empty(struct xfs_group *xg, unsigned int *busy_gen);
+struct xfs_extent_busy_tree *xfs_extent_busy_alloc(void);
-void
-xfs_extent_busy_wait_all(struct xfs_mount *mp);
-
-int
-xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
- const struct list_head *b);
-
+int xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
static inline void xfs_extent_busy_sort(struct list_head *list)
{
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
-bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
-
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index abffc74a924f..a25c713ff888 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -25,6 +25,10 @@
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -95,16 +99,15 @@ xfs_efi_item_format(
ASSERT(atomic_read(&efip->efi_next_extent) ==
efip->efi_format.efi_nextents);
+ ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT);
- efip->efi_format.efi_type = XFS_LI_EFI;
+ efip->efi_format.efi_type = lip->li_type;
efip->efi_format.efi_size = 1;
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
- &efip->efi_format,
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, &efip->efi_format,
xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents));
}
-
/*
* The unpin operation is the last place an EFI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
@@ -140,12 +143,14 @@ xfs_efi_item_release(
STATIC struct xfs_efi_log_item *
xfs_efi_init(
struct xfs_mount *mp,
+ unsigned short item_type,
uint nextents)
-
{
struct xfs_efi_log_item *efip;
+ ASSERT(item_type == XFS_LI_EFI || item_type == XFS_LI_EFI_RT);
ASSERT(nextents > 0);
+
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
efip = kzalloc(xfs_efi_log_item_sizeof(nextents),
GFP_KERNEL | __GFP_NOFAIL);
@@ -154,7 +159,7 @@ xfs_efi_init(
GFP_KERNEL | __GFP_NOFAIL);
}
- xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+ xfs_log_item_init(mp, &efip->efi_item, item_type, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (uintptr_t)(void *)efip;
atomic_set(&efip->efi_next_extent, 0);
@@ -264,12 +269,12 @@ xfs_efd_item_format(
struct xfs_log_iovec *vecp = NULL;
ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+ ASSERT(lip->li_type == XFS_LI_EFD || lip->li_type == XFS_LI_EFD_RT);
- efdp->efd_format.efd_type = XFS_LI_EFD;
+ efdp->efd_format.efd_type = lip->li_type;
efdp->efd_format.efd_size = 1;
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
- &efdp->efd_format,
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, &efdp->efd_format,
xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents));
}
@@ -308,6 +313,14 @@ static inline struct xfs_extent_free_item *xefi_entry(const struct list_head *e)
return list_entry(e, struct xfs_extent_free_item, xefi_list);
}
+static inline bool
+xfs_efi_item_isrt(const struct xfs_log_item *lip)
+{
+ ASSERT(lip->li_type == XFS_LI_EFI || lip->li_type == XFS_LI_EFI_RT);
+
+ return lip->li_type == XFS_LI_EFI_RT;
+}
+
/*
* Fill the EFD with all extents from the EFI when we need to roll the
* transaction and continue with a new EFI.
@@ -362,7 +375,7 @@ xfs_extent_free_diff_items(
struct xfs_extent_free_item *ra = xefi_entry(a);
struct xfs_extent_free_item *rb = xefi_entry(b);
- return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno;
+ return ra->xefi_group->xg_gno - rb->xefi_group->xg_gno;
}
/* Log a free extent to the intent item. */
@@ -388,18 +401,20 @@ xfs_extent_free_log_item(
}
static struct xfs_log_item *
-xfs_extent_free_create_intent(
+__xfs_extent_free_create_intent(
struct xfs_trans *tp,
struct list_head *items,
unsigned int count,
- bool sort)
+ bool sort,
+ unsigned short item_type)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
+ struct xfs_efi_log_item *efip;
struct xfs_extent_free_item *xefi;
ASSERT(count > 0);
+ efip = xfs_efi_init(mp, item_type, count);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
list_for_each_entry(xefi, items, xefi_list)
@@ -407,6 +422,23 @@ xfs_extent_free_create_intent(
return &efip->efi_item;
}
+static struct xfs_log_item *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_extent_free_create_intent(tp, items, count, sort,
+ XFS_LI_EFI);
+}
+
+static inline unsigned short
+xfs_efd_type_from_efi(const struct xfs_efi_log_item *efip)
+{
+ return xfs_efi_item_isrt(&efip->efi_item) ? XFS_LI_EFD_RT : XFS_LI_EFD;
+}
+
/* Get an EFD so we can process all the free extents. */
static struct xfs_log_item *
xfs_extent_free_create_done(
@@ -427,8 +459,8 @@ xfs_extent_free_create_done(
GFP_KERNEL | __GFP_NOFAIL);
}
- xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
- &xfs_efd_item_ops);
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item,
+ xfs_efd_type_from_efi(efip), &xfs_efd_item_ops);
efdp->efd_efip = efip;
efdp->efd_format.efd_nextents = count;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
@@ -436,6 +468,17 @@ xfs_extent_free_create_done(
return &efdp->efd_item;
}
+static inline const struct xfs_defer_op_type *
+xefi_ops(
+ struct xfs_extent_free_item *xefi)
+{
+ if (xfs_efi_is_realtime(xefi))
+ return &xfs_rtextent_free_defer_type;
+ if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
+ return &xfs_agfl_free_defer_type;
+ return &xfs_extent_free_defer_type;
+}
+
/* Add this deferred EFI to the transaction. */
void
xfs_extent_free_defer_add(
@@ -445,15 +488,11 @@ xfs_extent_free_defer_add(
{
struct xfs_mount *mp = tp->t_mountp;
- trace_xfs_extent_free_defer(mp, xefi);
+ xefi->xefi_group = xfs_group_intent_get(mp, xefi->xefi_startblock,
+ xfs_efi_is_realtime(xefi) ? XG_TYPE_RTG : XG_TYPE_AG);
- xefi->xefi_pag = xfs_perag_intent_get(mp, xefi->xefi_startblock);
- if (xefi->xefi_agresv == XFS_AG_RESV_AGFL)
- *dfpp = xfs_defer_add(tp, &xefi->xefi_list,
- &xfs_agfl_free_defer_type);
- else
- *dfpp = xfs_defer_add(tp, &xefi->xefi_list,
- &xfs_extent_free_defer_type);
+ trace_xfs_extent_free_defer(mp, xefi);
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list, xefi_ops(xefi));
}
/* Cancel a free extent. */
@@ -463,7 +502,7 @@ xfs_extent_free_cancel_item(
{
struct xfs_extent_free_item *xefi = xefi_entry(item);
- xfs_perag_intent_put(xefi->xefi_pag);
+ xfs_group_intent_put(xefi->xefi_group);
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
@@ -499,7 +538,7 @@ xfs_extent_free_finish_item(
* in this EFI to the EFD so this works correctly.
*/
if (!(xefi->xefi_flags & XFS_EFI_CANCELLED))
- error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
+ error = __xfs_free_extent(tp, to_perag(xefi->xefi_group), agbno,
xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
if (error == -EAGAIN) {
@@ -545,10 +584,10 @@ xfs_agfl_free_finish_item(
trace_xfs_agfl_free_deferred(mp, xefi);
- error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp);
+ error = xfs_alloc_read_agf(to_perag(xefi->xefi_group), tp, 0, &agbp);
if (!error)
- error = xfs_free_ag_extent(tp, agbp, xefi->xefi_pag->pag_agno,
- agbno, 1, &oinfo, XFS_AG_RESV_AGFL);
+ error = xfs_free_ag_extent(tp, agbp, agbno, 1, &oinfo,
+ XFS_AG_RESV_AGFL);
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(&xefi->xefi_list);
@@ -559,8 +598,12 @@ xfs_agfl_free_finish_item(
static inline bool
xfs_efi_validate_ext(
struct xfs_mount *mp,
+ bool isrt,
struct xfs_extent *extp)
{
+ if (isrt)
+ return xfs_verify_rtbext(mp, extp->ext_start, extp->ext_len);
+
return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
}
@@ -568,6 +611,7 @@ static inline void
xfs_efi_recover_work(
struct xfs_mount *mp,
struct xfs_defer_pending *dfp,
+ bool isrt,
struct xfs_extent *extp)
{
struct xfs_extent_free_item *xefi;
@@ -578,7 +622,10 @@ xfs_efi_recover_work(
xefi->xefi_blockcount = extp->ext_len;
xefi->xefi_agresv = XFS_AG_RESV_NONE;
xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
- xefi->xefi_pag = xfs_perag_intent_get(mp, extp->ext_start);
+ xefi->xefi_group = xfs_group_intent_get(mp, extp->ext_start,
+ isrt ? XG_TYPE_RTG : XG_TYPE_AG);
+ if (isrt)
+ xefi->xefi_flags |= XFS_EFI_REALTIME;
xfs_defer_add_item(dfp, &xefi->xefi_list);
}
@@ -599,14 +646,15 @@ xfs_extent_free_recover_work(
struct xfs_trans *tp;
int i;
int error = 0;
+ bool isrt = xfs_efi_item_isrt(lip);
/*
- * First check the validity of the extents described by the
- * EFI. If any are bad, then assume that all are bad and
- * just toss the EFI.
+ * First check the validity of the extents described by the EFI. If
+ * any are bad, then assume that all are bad and just toss the EFI.
+ * Mixing RT and non-RT extents in the same EFI item is not allowed.
*/
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- if (!xfs_efi_validate_ext(mp,
+ if (!xfs_efi_validate_ext(mp, isrt,
&efip->efi_format.efi_extents[i])) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&efip->efi_format,
@@ -614,7 +662,8 @@ xfs_extent_free_recover_work(
return -EFSCORRUPTED;
}
- xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]);
+ xfs_efi_recover_work(mp, dfp, isrt,
+ &efip->efi_format.efi_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -652,10 +701,12 @@ xfs_extent_free_relog_intent(
count = EFI_ITEM(intent)->efi_format.efi_nextents;
extp = EFI_ITEM(intent)->efi_format.efi_extents;
+ ASSERT(intent->li_type == XFS_LI_EFI || intent->li_type == XFS_LI_EFI_RT);
+
efdp->efd_next_extent = count;
memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
- efip = xfs_efi_init(tp->t_mountp, count);
+ efip = xfs_efi_init(tp->t_mountp, intent->li_type, count);
memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
atomic_set(&efip->efi_next_extent, count);
@@ -687,6 +738,72 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.relog_intent = xfs_extent_free_relog_intent,
};
+#ifdef CONFIG_XFS_RT
+/* Create a realtime extent freeing intent item. */
+static struct xfs_log_item *
+xfs_rtextent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return __xfs_extent_free_create_intent(tp, items, count, sort,
+ XFS_LI_EFI_RT);
+}
+
+/* Process a free realtime extent. */
+STATIC int
+xfs_rtextent_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent_free_item *xefi = xefi_entry(item);
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
+ struct xfs_rtgroup **rtgp = (struct xfs_rtgroup **)state;
+ int error = 0;
+
+ trace_xfs_extent_free_deferred(mp, xefi);
+
+ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
+ if (*rtgp != to_rtg(xefi->xefi_group)) {
+ *rtgp = to_rtg(xefi->xefi_group);
+ xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
+ xfs_rtgroup_trans_join(tp, *rtgp,
+ XFS_RTGLOCK_BITMAP);
+ }
+ error = xfs_rtfree_blocks(tp, *rtgp,
+ xefi->xefi_startblock, xefi->xefi_blockcount);
+ }
+ if (error == -EAGAIN) {
+ xfs_efd_from_efi(efdp);
+ return error;
+ }
+
+ xfs_efd_add_extent(efdp, xefi);
+ xfs_extent_free_cancel_item(item);
+ return error;
+}
+
+const struct xfs_defer_op_type xfs_rtextent_free_defer_type = {
+ .name = "rtextent_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rtextent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_rtextent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+#else
+const struct xfs_defer_op_type xfs_rtextent_free_defer_type = {
+ .name = "rtextent_free",
+};
+#endif /* CONFIG_XFS_RT */
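
Because the defer machinery sorts items by group, xfs_rtextent_free_finish_item() above caches the locked rtgroup in *state and only takes the bitmap lock when the group changes; the transaction, not the function itself, releases it. A simplified user-space sketch of the lock-caching idiom (with an explicit unlock, which the kernel instead delegates to the transaction):

#include <pthread.h>
#include <stddef.h>

struct group {
	pthread_mutex_t lock;
	int id;
};

static void process_item(struct group *g, struct group **cached)
{
	if (*cached != g) {
		/* The kernel joins the lock to the transaction instead. */
		if (*cached)
			pthread_mutex_unlock(&(*cached)->lock);
		pthread_mutex_lock(&g->lock);
		*cached = g;
	}
	/* ... free the extent within group g ... */
}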
+
STATIC bool
xfs_efi_item_match(
struct xfs_log_item *lip,
@@ -731,7 +848,7 @@ xlog_recover_efi_commit_pass2(
return -EFSCORRUPTED;
}
- efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+ efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents);
error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
if (error) {
xfs_efi_item_free(efip);
@@ -749,6 +866,58 @@ const struct xlog_recover_item_ops xlog_efi_item_ops = {
.commit_pass2 = xlog_recover_efi_commit_pass2,
};
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtefi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
+ int error;
+
+ efi_formatp = item->ri_buf[0].i_addr;
+
+ if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ efip = xfs_efi_init(mp, ITEM_TYPE(item), efi_formatp->efi_nextents);
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
+ xfs_efi_item_free(efip);
+ return error;
+ }
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+
+ xlog_recover_intent_item(log, &efip->efi_item, lsn,
+ &xfs_rtextent_free_defer_type);
+ return 0;
+}
+#else
+STATIC int
+xlog_recover_rtefi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+}
+#endif
+
+const struct xlog_recover_item_ops xlog_rtefi_item_ops = {
+ .item_type = XFS_LI_EFI_RT,
+ .commit_pass2 = xlog_recover_rtefi_commit_pass2,
+};
+
/*
* This routine is called when an EFD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding EFI if it
@@ -791,3 +960,44 @@ const struct xlog_recover_item_ops xlog_efd_item_ops = {
.item_type = XFS_LI_EFD,
.commit_pass2 = xlog_recover_efd_commit_pass2,
};
+
+#ifdef CONFIG_XFS_RT
+STATIC int
+xlog_recover_rtefd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efd_log_format *efd_formatp;
+ int buflen = item->ri_buf[0].i_len;
+
+ efd_formatp = item->ri_buf[0].i_addr;
+
+ if (buflen < sizeof(struct xfs_efd_log_format)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ efd_formatp->efd_nextents) &&
+ item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ efd_formatp->efd_nextents)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
+ efd_formatp, buflen);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_EFI_RT,
+ efd_formatp->efd_efi_id);
+ return 0;
+}
+#else
+# define xlog_recover_rtefd_commit_pass2 xlog_recover_rtefi_commit_pass2
+#endif
+
+const struct xlog_recover_item_ops xlog_rtefd_item_ops = {
+ .item_type = XFS_LI_EFD_RT,
+ .commit_pass2 = xlog_recover_rtefd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b19916b11fd5..4a0b7de4f7ae 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -852,6 +852,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+		 * supported. It would be possible to atomically write less than
+ * a FS block, but there is no requirement to support this.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
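
From user space, such an atomic write is issued with pwritev2() and RWF_ATOMIC; per the check above, the length must be exactly one filesystem block or the kernel returns -EINVAL. A sketch, assuming a kernel and headers new enough to define RWF_ATOMIC and a buffer aligned appropriately for O_DIRECT:

#define _GNU_SOURCE
#include <sys/uio.h>

/* len must equal the fs block size; buf must be suitably aligned. */
static ssize_t write_block_atomic(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}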
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -1239,6 +1253,8 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
@@ -1425,6 +1441,8 @@ xfs_dax_read_fault(
struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
vm_fault_t ret;
+ trace_xfs_read_fault(ip, order);
+
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault_locked(vmf, order, false);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
@@ -1432,6 +1450,16 @@ xfs_dax_read_fault(
return ret;
}
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_lock (MM)
+ * sb_start_pagefault(vfs, freeze)
+ * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
static vm_fault_t
xfs_write_fault(
struct vm_fault *vmf,
@@ -1442,6 +1470,8 @@ xfs_write_fault(
unsigned int lock_mode = XFS_MMAPLOCK_SHARED;
vm_fault_t ret;
+ trace_xfs_write_fault(ip, order);
+
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
@@ -1460,40 +1490,13 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
-/*
- * Locking for serialisation of IO during page faults. This results in a lock
- * ordering of:
- *
- * mmap_lock (MM)
- * sb_start_pagefault(vfs, freeze)
- * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
- * page_lock (MM)
- * i_lock (XFS - extent map serialisation)
- */
-static vm_fault_t
-__xfs_filemap_fault(
- struct vm_fault *vmf,
- unsigned int order,
- bool write_fault)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
-
- trace_xfs_filemap_fault(XFS_I(inode), order, write_fault);
-
- if (write_fault)
- return xfs_write_fault(vmf, order);
- if (IS_DAX(inode))
- return xfs_dax_read_fault(vmf, order);
- return filemap_fault(vmf);
-}
-
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1506,10 +1509,17 @@ static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, 0,
- IS_DAX(file_inode(vmf->vma->vm_file)) &&
- xfs_is_write_fault(vmf));
+ if (IS_DAX(inode)) {
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, 0);
+ return xfs_dax_read_fault(vmf, 0);
+ }
+
+ trace_xfs_read_fault(XFS_I(inode), 0);
+ return filemap_fault(vmf);
}
static vm_fault_t
@@ -1521,15 +1531,16 @@ xfs_filemap_huge_fault(
return VM_FAULT_FALLBACK;
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, order,
- xfs_is_write_fault(vmf));
+ if (xfs_is_write_fault(vmf))
+ return xfs_write_fault(vmf, order);
+ return xfs_dax_read_fault(vmf, order);
}
static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, 0, true);
+ return xfs_write_fault(vmf, 0);
}
/*
@@ -1541,8 +1552,7 @@ static vm_fault_t
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
-
- return __xfs_filemap_fault(vmf, 0, true);
+ return xfs_write_fault(vmf, 0);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 290ba8887d29..a961aa420c48 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -96,7 +96,7 @@ restart:
maxfree = pag->pagf_freeblks;
if (max_pag)
xfs_perag_rele(max_pag);
- atomic_inc(&pag->pag_active_ref);
+ atomic_inc(&pag_group(pag)->xg_active_ref);
max_pag = pag;
}
@@ -222,12 +222,12 @@ xfs_filestream_lookup_association(
* down immediately after we mark the lookup as done.
*/
pag = container_of(mru, struct xfs_fstrm_item, mru)->pag;
- atomic_inc(&pag->pag_active_ref);
+ atomic_inc(&pag_group(pag)->xg_active_ref);
xfs_mru_cache_done(mp->m_filestream);
trace_xfs_filestream_lookup(pag, ap->ip->i_ino);
- ap->blkno = XFS_AGB_TO_FSB(args->mp, pag->pag_agno, 0);
+ ap->blkno = xfs_agbno_to_fsb(pag, 0);
xfs_bmap_adjacent(ap);
/*
@@ -275,7 +275,7 @@ xfs_filestream_create_association(
struct xfs_fstrm_item *item =
container_of(mru, struct xfs_fstrm_item, mru);
- agno = (item->pag->pag_agno + 1) % mp->m_sb.sb_agcount;
+ agno = (pag_agno(item->pag) + 1) % mp->m_sb.sb_agcount;
xfs_fstrm_free_func(mp, mru);
} else if (xfs_is_inode32(mp)) {
xfs_agnumber_t rotorstep = xfs_rotorstep;
@@ -314,7 +314,7 @@ xfs_filestream_create_association(
if (!item)
goto out_put_fstrms;
- atomic_inc(&args->pag->pag_active_ref);
+ atomic_inc(&pag_group(args->pag)->xg_active_ref);
item->pag = args->pag;
error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
if (error)
@@ -344,7 +344,6 @@ xfs_filestream_select_ag(
struct xfs_alloc_arg *args,
xfs_extlen_t *longest)
{
- struct xfs_mount *mp = args->mp;
struct xfs_inode *pip;
xfs_ino_t ino = 0;
int error = 0;
@@ -370,7 +369,7 @@ xfs_filestream_select_ag(
return error;
out_select:
- ap->blkno = XFS_AGB_TO_FSB(mp, args->pag->pag_agno, 0);
+ ap->blkno = xfs_agbno_to_fsb(args->pag, 0);
return 0;
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index ae18ab86e608..82f2e0dd2249 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -25,6 +25,7 @@
#include "xfs_alloc_btree.h"
#include "xfs_rtbitmap.h"
#include "xfs_ag.h"
+#include "xfs_rtgroup.h"
/* Convert an xfs_fsmap to an fsmap. */
static void
@@ -110,18 +111,18 @@ xfs_fsmap_owner_to_rmap(
/* Convert an rmapbt owner into an fsmap owner. */
static int
-xfs_fsmap_owner_from_rmap(
+xfs_fsmap_owner_from_frec(
struct xfs_fsmap *dest,
- const struct xfs_rmap_irec *src)
+ const struct xfs_fsmap_irec *frec)
{
dest->fmr_flags = 0;
- if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
- dest->fmr_owner = src->rm_owner;
+ if (!XFS_RMAP_NON_INODE_OWNER(frec->owner)) {
+ dest->fmr_owner = frec->owner;
return 0;
}
dest->fmr_flags |= FMR_OF_SPECIAL_OWNER;
- switch (src->rm_owner) {
+ switch (frec->owner) {
case XFS_RMAP_OWN_FS:
dest->fmr_owner = XFS_FMR_OWN_FS;
break;
@@ -158,7 +159,7 @@ struct xfs_getfsmap_info {
struct xfs_fsmap_head *head;
struct fsmap *fsmap_recs; /* mapping records */
struct xfs_buf *agf_bp; /* AGF, for refcount queries */
- struct xfs_perag *pag; /* AG info, if applicable */
+ struct xfs_group *group; /* group info, if applicable */
xfs_daddr_t next_daddr; /* next daddr we expect */
/* daddr of low fsmap key when we're using the rtbitmap */
xfs_daddr_t low_daddr;
@@ -203,7 +204,7 @@ STATIC int
xfs_getfsmap_is_shared(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- const struct xfs_rmap_irec *rec,
+ const struct xfs_fsmap_irec *frec,
bool *stat)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -216,15 +217,17 @@ xfs_getfsmap_is_shared(
if (!xfs_has_reflink(mp))
return 0;
/* rt files will have no perag structure */
- if (!info->pag)
+ if (!info->group)
return 0;
/* Are there any shared blocks here? */
flen = 0;
- cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag);
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+ to_perag(info->group));
- error = xfs_refcount_find_shared(cur, rec->rm_startblock,
- rec->rm_blockcount, &fbno, &flen, false);
+ error = xfs_refcount_find_shared(cur, frec->rec_key,
+ XFS_BB_TO_FSBT(mp, frec->len_daddr), &fbno, &flen,
+ false);
xfs_btree_del_cursor(cur, error);
if (error)
@@ -249,15 +252,22 @@ xfs_getfsmap_format(
}
static inline bool
-xfs_getfsmap_rec_before_start(
+xfs_getfsmap_frec_before_start(
struct xfs_getfsmap_info *info,
- const struct xfs_rmap_irec *rec,
- xfs_daddr_t rec_daddr)
+ const struct xfs_fsmap_irec *frec)
{
if (info->low_daddr != XFS_BUF_DADDR_NULL)
- return rec_daddr < info->low_daddr;
- if (info->low.rm_blockcount)
- return xfs_rmap_compare(rec, &info->low) < 0;
+ return frec->start_daddr < info->low_daddr;
+ if (info->low.rm_blockcount) {
+ struct xfs_rmap_irec rec = {
+ .rm_startblock = frec->rec_key,
+ .rm_owner = frec->owner,
+ .rm_flags = frec->rm_flags,
+ };
+
+ return xfs_rmap_compare(&rec, &info->low) < 0;
+ }
+
return false;
}
@@ -270,61 +280,36 @@ STATIC int
xfs_getfsmap_helper(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info,
- const struct xfs_rmap_irec *rec,
- xfs_daddr_t rec_daddr,
- xfs_daddr_t len_daddr)
+ const struct xfs_fsmap_irec *frec)
{
struct xfs_fsmap fmr;
struct xfs_mount *mp = tp->t_mountp;
bool shared;
- int error;
+ int error = 0;
if (fatal_signal_pending(current))
return -EINTR;
- if (len_daddr == 0)
- len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
-
/*
* Filter out records that start before our startpoint, if the
* caller requested that.
*/
- if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) {
- rec_daddr += len_daddr;
- if (info->next_daddr < rec_daddr)
- info->next_daddr = rec_daddr;
- return 0;
- }
-
- /*
- * For an info->last query, we're looking for a gap between the last
- * mapping emitted and the high key specified by userspace. If the
- * user's query spans less than 1 fsblock, then info->high and
- * info->low will have the same rm_startblock, which causes rec_daddr
- * and next_daddr to be the same. Therefore, use the end_daddr that
- * we calculated from userspace's high key to synthesize the record.
- * Note that if the btree query found a mapping, there won't be a gap.
- */
- if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL)
- rec_daddr = info->end_daddr;
+ if (xfs_getfsmap_frec_before_start(info, frec))
+ goto out;
/* Are we just counting mappings? */
if (info->head->fmh_count == 0) {
if (info->head->fmh_entries == UINT_MAX)
return -ECANCELED;
- if (rec_daddr > info->next_daddr)
+ if (frec->start_daddr > info->next_daddr)
info->head->fmh_entries++;
if (info->last)
return 0;
info->head->fmh_entries++;
-
- rec_daddr += len_daddr;
- if (info->next_daddr < rec_daddr)
- info->next_daddr = rec_daddr;
- return 0;
+ goto out;
}
/*
@@ -332,7 +317,7 @@ xfs_getfsmap_helper(
* then we've found a gap. Report the gap as being owned by
* whatever the caller specified is the missing owner.
*/
- if (rec_daddr > info->next_daddr) {
+ if (frec->start_daddr > info->next_daddr) {
if (info->head->fmh_entries >= info->head->fmh_count)
return -ECANCELED;
@@ -340,7 +325,7 @@ xfs_getfsmap_helper(
fmr.fmr_physical = info->next_daddr;
fmr.fmr_owner = info->missing_owner;
fmr.fmr_offset = 0;
- fmr.fmr_length = rec_daddr - info->next_daddr;
+ fmr.fmr_length = frec->start_daddr - info->next_daddr;
fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
xfs_getfsmap_format(mp, &fmr, info);
}
@@ -353,23 +338,24 @@ xfs_getfsmap_helper(
return -ECANCELED;
trace_xfs_fsmap_mapping(mp, info->dev,
- info->pag ? info->pag->pag_agno : NULLAGNUMBER, rec);
+ info->group ? info->group->xg_gno : NULLAGNUMBER,
+ frec);
fmr.fmr_device = info->dev;
- fmr.fmr_physical = rec_daddr;
- error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+ fmr.fmr_physical = frec->start_daddr;
+ error = xfs_fsmap_owner_from_frec(&fmr, frec);
if (error)
return error;
- fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
- fmr.fmr_length = len_daddr;
- if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+ fmr.fmr_offset = XFS_FSB_TO_BB(mp, frec->offset);
+ fmr.fmr_length = frec->len_daddr;
+ if (frec->rm_flags & XFS_RMAP_UNWRITTEN)
fmr.fmr_flags |= FMR_OF_PREALLOC;
- if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ if (frec->rm_flags & XFS_RMAP_ATTR_FORK)
fmr.fmr_flags |= FMR_OF_ATTR_FORK;
- if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ if (frec->rm_flags & XFS_RMAP_BMBT_BLOCK)
fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
if (fmr.fmr_flags == 0) {
- error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+ error = xfs_getfsmap_is_shared(tp, info, frec, &shared);
if (error)
return error;
if (shared)
@@ -378,28 +364,55 @@ xfs_getfsmap_helper(
xfs_getfsmap_format(mp, &fmr, info);
out:
- rec_daddr += len_daddr;
- if (info->next_daddr < rec_daddr)
- info->next_daddr = rec_daddr;
+ info->next_daddr = max(info->next_daddr,
+ frec->start_daddr + frec->len_daddr);
return 0;
}
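
The gap-reporting logic above reduces to: if the next record starts past next_daddr, synthesize a record for the hole with the caller's missing owner, then advance next_daddr past the record. A compact user-space rendering, with emit() standing in for xfs_getfsmap_format():

#include <stdint.h>
#include <stdio.h>

static void emit(uint64_t phys, uint64_t len, const char *owner)
{
	printf("%16llu %16llu %s\n", (unsigned long long)phys,
	       (unsigned long long)len, owner);
}

/* Returns the updated next_daddr after reporting one record. */
static uint64_t report(uint64_t next_daddr, uint64_t start, uint64_t len,
		       const char *owner)
{
	if (start > next_daddr)		/* a hole before this record */
		emit(next_daddr, start - next_daddr, "missing");
	emit(start, len, owner);
	return start + len > next_daddr ? start + len : next_daddr;
}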
+static inline int
+xfs_getfsmap_group_helper(
+ struct xfs_getfsmap_info *info,
+ struct xfs_trans *tp,
+ struct xfs_group *xg,
+ xfs_agblock_t startblock,
+ xfs_extlen_t blockcount,
+ struct xfs_fsmap_irec *frec)
+{
+ /*
+ * For an info->last query, we're looking for a gap between the last
+ * mapping emitted and the high key specified by userspace. If the
+ * user's query spans less than 1 fsblock, then info->high and
+ * info->low will have the same rm_startblock, which causes rec_daddr
+ * and next_daddr to be the same. Therefore, use the end_daddr that
+ * we calculated from userspace's high key to synthesize the record.
+ * Note that if the btree query found a mapping, there won't be a gap.
+ */
+ if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL)
+ frec->start_daddr = info->end_daddr;
+ else
+ frec->start_daddr = xfs_gbno_to_daddr(xg, startblock);
+
+ frec->len_daddr = XFS_FSB_TO_BB(xg->xg_mount, blockcount);
+ return xfs_getfsmap_helper(tp, info, frec);
+}
+
/* Transform a rmapbt irec into a fsmap */
STATIC int
-xfs_getfsmap_datadev_helper(
+xfs_getfsmap_rmapbt_helper(
struct xfs_btree_cur *cur,
const struct xfs_rmap_irec *rec,
void *priv)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_fsmap_irec frec = {
+ .owner = rec->rm_owner,
+ .offset = rec->rm_offset,
+ .rm_flags = rec->rm_flags,
+ .rec_key = rec->rm_startblock,
+ };
struct xfs_getfsmap_info *info = priv;
- xfs_fsblock_t fsb;
- xfs_daddr_t rec_daddr;
- fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock);
- rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
-
- return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0);
+ return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group,
+ rec->rm_startblock, rec->rm_blockcount, &frec);
}
/* Transform a bnobt irec into a fsmap */
@@ -409,21 +422,14 @@ xfs_getfsmap_datadev_bnobt_helper(
const struct xfs_alloc_rec_incore *rec,
void *priv)
{
- struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_fsmap_irec frec = {
+ .owner = XFS_RMAP_OWN_NULL, /* "free" */
+ .rec_key = rec->ar_startblock,
+ };
struct xfs_getfsmap_info *info = priv;
- struct xfs_rmap_irec irec;
- xfs_daddr_t rec_daddr;
-
- rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno,
- rec->ar_startblock);
-
- irec.rm_startblock = rec->ar_startblock;
- irec.rm_blockcount = rec->ar_blockcount;
- irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
- irec.rm_offset = 0;
- irec.rm_flags = 0;
- return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0);
+ return xfs_getfsmap_group_helper(info, cur->bc_tp, cur->bc_group,
+ rec->ar_startblock, rec->ar_blockcount, &frec);
}
/* Set rmap flags based on the getfsmap flags */
@@ -467,12 +473,11 @@ __xfs_getfsmap_datadev(
void *priv)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
struct xfs_btree_cur *bt_cur = NULL;
xfs_fsblock_t start_fsb;
xfs_fsblock_t end_fsb;
- xfs_agnumber_t start_ag;
- xfs_agnumber_t end_ag;
+ xfs_agnumber_t start_ag, end_ag;
uint64_t eofs;
int error = 0;
@@ -520,13 +525,13 @@ __xfs_getfsmap_datadev(
start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
- for_each_perag_range(mp, start_ag, end_ag, pag) {
+ while ((pag = xfs_perag_next_range(mp, pag, start_ag, end_ag))) {
/*
* Set the AG high key from the fsmap high key if this
* is the last AG that we're querying.
*/
- info->pag = pag;
- if (pag->pag_agno == end_ag) {
+ info->group = pag_group(pag);
+ if (pag_agno(pag) == end_ag) {
info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
end_fsb);
info->high.rm_offset = XFS_BB_TO_FSBT(mp,
@@ -548,9 +553,9 @@ __xfs_getfsmap_datadev(
if (error)
break;
- trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno,
+ trace_xfs_fsmap_low_group_key(mp, info->dev, pag_agno(pag),
&info->low);
- trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno,
+ trace_xfs_fsmap_high_group_key(mp, info->dev, pag_agno(pag),
&info->high);
error = query_fn(tp, info, &bt_cur, priv);
@@ -561,7 +566,7 @@ __xfs_getfsmap_datadev(
* Set the AG low key to the start of the AG prior to
* moving on to the next AG.
*/
- if (pag->pag_agno == start_ag)
+ if (pag_agno(pag) == start_ag)
memset(&info->low, 0, sizeof(info->low));
/*
@@ -569,13 +574,13 @@ __xfs_getfsmap_datadev(
* before we drop the reference to the perag when the loop
* terminates.
*/
- if (pag->pag_agno == end_ag) {
+ if (pag_agno(pag) == end_ag) {
info->last = true;
error = query_fn(tp, info, &bt_cur, priv);
if (error)
break;
}
- info->pag = NULL;
+ info->group = NULL;
}
if (bt_cur)
@@ -585,9 +590,9 @@ __xfs_getfsmap_datadev(
xfs_trans_brelse(tp, info->agf_bp);
info->agf_bp = NULL;
}
- if (info->pag) {
- xfs_perag_rele(info->pag);
- info->pag = NULL;
+ if (info->group) {
+ xfs_perag_rele(pag);
+ info->group = NULL;
} else if (pag) {
/* loop termination case */
xfs_perag_rele(pag);
@@ -606,13 +611,13 @@ xfs_getfsmap_datadev_rmapbt_query(
{
/* Report any gap at the end of the last AG. */
if (info->last)
- return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+ return xfs_getfsmap_rmapbt_helper(*curpp, &info->high, info);
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->pag);
+ to_perag(info->group));
return xfs_rmap_query_range(*curpp, &info->low, &info->high,
- xfs_getfsmap_datadev_helper, info);
+ xfs_getfsmap_rmapbt_helper, info);
}
/* Execute a getfsmap query against the regular data device rmapbt. */
@@ -643,7 +648,7 @@ xfs_getfsmap_datadev_bnobt_query(
/* Allocate cursor for this AG and query_range it. */
*curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp,
- info->pag);
+ to_perag(info->group));
key->ar_startblock = info->low.rm_startblock;
key[1].ar_startblock = info->high.rm_startblock;
return xfs_alloc_query_range(*curpp, key, &key[1],
@@ -672,9 +677,12 @@ xfs_getfsmap_logdev(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
+ struct xfs_fsmap_irec frec = {
+ .start_daddr = 0,
+ .rec_key = 0,
+ .owner = XFS_RMAP_OWN_LOG,
+ };
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_rmap_irec rmap;
- xfs_daddr_t rec_daddr, len_daddr;
xfs_fsblock_t start_fsb, end_fsb;
uint64_t eofs;
@@ -689,51 +697,53 @@ xfs_getfsmap_logdev(
if (keys[0].fmr_length > 0)
info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb);
- trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb);
- trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb);
+ trace_xfs_fsmap_low_linear_key(mp, info->dev, start_fsb);
+ trace_xfs_fsmap_high_linear_key(mp, info->dev, end_fsb);
if (start_fsb > 0)
return 0;
/* Fabricate an rmap entry for the external log device. */
- rmap.rm_startblock = 0;
- rmap.rm_blockcount = mp->m_sb.sb_logblocks;
- rmap.rm_owner = XFS_RMAP_OWN_LOG;
- rmap.rm_offset = 0;
- rmap.rm_flags = 0;
-
- rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock);
- len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount);
- return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr);
+ frec.len_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+ return xfs_getfsmap_helper(tp, info, &frec);
}
#ifdef CONFIG_XFS_RT
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
+ struct xfs_fsmap_irec frec = {
+ .owner = XFS_RMAP_OWN_NULL, /* "free" */
+ };
+ struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_getfsmap_info *info = priv;
- struct xfs_rmap_irec irec;
- xfs_rtblock_t rtbno;
- xfs_daddr_t rec_daddr, len_daddr;
-
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
- rec_daddr = XFS_FSB_TO_BB(mp, rtbno);
- irec.rm_startblock = rtbno;
+ xfs_rtblock_t start_rtb =
+ xfs_rtx_to_rtb(rtg, rec->ar_startext);
+ uint64_t rtbcount =
+ xfs_rtbxlen_to_blen(mp, rec->ar_extcount);
- rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount);
- len_daddr = XFS_FSB_TO_BB(mp, rtbno);
- irec.rm_blockcount = rtbno;
-
- irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
- irec.rm_offset = 0;
- irec.rm_flags = 0;
+ /*
+ * For an info->last query, we're looking for a gap between the last
+ * mapping emitted and the high key specified by userspace. If the
+ * user's query spans less than 1 fsblock, then info->high and
+ * info->low will have the same rm_startblock, which causes rec_daddr
+ * and next_daddr to be the same. Therefore, use the end_daddr that
+ * we calculated from userspace's high key to synthesize the record.
+ * Note that if the btree query found a mapping, there won't be a gap.
+ */
+ if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) {
+ frec.start_daddr = info->end_daddr;
+ } else {
+ frec.start_daddr = xfs_rtb_to_daddr(mp, start_rtb);
+ }
- return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr);
+ frec.len_daddr = XFS_FSB_TO_BB(mp, rtbcount);
+ return xfs_getfsmap_helper(tp, info, &frec);
}
/* Execute a getfsmap query against the realtime device rtbitmap. */
@@ -743,58 +753,83 @@ xfs_getfsmap_rtdev_rtbitmap(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
-
- struct xfs_rtalloc_rec ahigh = { 0 };
struct xfs_mount *mp = tp->t_mountp;
- xfs_rtblock_t start_rtb;
- xfs_rtblock_t end_rtb;
- xfs_rtxnum_t high;
+ xfs_rtblock_t start_rtbno, end_rtbno;
+ xfs_rtxnum_t start_rtx, end_rtx;
+ xfs_rgnumber_t start_rgno, end_rgno;
+ struct xfs_rtgroup *rtg = NULL;
uint64_t eofs;
int error;
- eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents));
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
if (keys[0].fmr_physical >= eofs)
return 0;
- start_rtb = XFS_BB_TO_FSBT(mp,
- keys[0].fmr_physical + keys[0].fmr_length);
- end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_UNKNOWN;
/* Adjust the low key if we are continuing from where we left off. */
+ start_rtbno = xfs_daddr_to_rtb(mp,
+ keys[0].fmr_physical + keys[0].fmr_length);
if (keys[0].fmr_length > 0) {
- info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb);
+ info->low_daddr = xfs_rtb_to_daddr(mp, start_rtbno);
if (info->low_daddr >= eofs)
return 0;
}
+ start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
+ start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
- trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb);
- trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb);
+ end_rtbno = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
+ end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
- xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
+ trace_xfs_fsmap_low_linear_key(mp, info->dev, start_rtbno);
+ trace_xfs_fsmap_high_linear_key(mp, info->dev, end_rtbno);
- /*
- * Set up query parameters to return free rtextents covering the range
- * we want.
- */
- high = xfs_rtb_to_rtxup(mp, end_rtb);
- error = xfs_rtalloc_query_range(mp, tp, xfs_rtb_to_rtx(mp, start_rtb),
- high, xfs_getfsmap_rtdev_rtbitmap_helper, info);
- if (error)
- goto err;
+ end_rtx = -1ULL;
- /*
- * Report any gaps at the end of the rtbitmap by simulating a null
- * rmap starting at the block after the end of the query range.
- */
- info->last = true;
- ahigh.ar_startext = min(mp->m_sb.sb_rextents, high);
+ while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
+ if (rtg_rgno(rtg) == end_rgno)
+ end_rtx = xfs_rtb_to_rtx(mp,
+ end_rtbno + mp->m_sb.sb_rextsize - 1);
+
+ info->group = rtg_group(rtg);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_range(rtg, tp, start_rtx, end_rtx,
+ xfs_getfsmap_rtdev_rtbitmap_helper, info);
+ if (error)
+ break;
+
+ /*
+ * Report any gaps at the end of the rtbitmap by simulating a
+ * zero-length free extent starting at the rtx after the end
+ * of the query range.
+ */
+ if (rtg_rgno(rtg) == end_rgno) {
+ struct xfs_rtalloc_rec ahigh = {
+ .ar_startext = min(end_rtx + 1,
+ rtg->rtg_extents),
+ };
+
+ info->last = true;
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(rtg, tp,
+ &ahigh, info);
+ if (error)
+ break;
+ }
+
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ info->group = NULL;
+ start_rtx = 0;
+ }
+
+ /* loop termination case */
+ if (rtg) {
+ if (info->group) {
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ info->group = NULL;
+ }
+ xfs_rtgroup_rele(rtg);
+ }
- error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
- if (error)
- goto err;
-err:
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
return error;
}
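
A note on the cleanup contract assumed by the loop above: when xfs_rtgroup_next_range() runs to completion it drops the final reference and returns NULL, so rtg is only non-NULL after an early break. In that case the caller still owns the current group's reference, and if info->group is still set the bitmap lock is also still held, which is why the termination block releases both explicitly.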
#endif /* CONFIG_XFS_RT */
@@ -1003,7 +1038,7 @@ xfs_getfsmap(
info.dev = handlers[i].dev;
info.last = false;
- info.pag = NULL;
+ info.group = NULL;
info.low_daddr = XFS_BUF_DADDR_NULL;
info.low.rm_blockcount = 0;
error = handlers[i].fn(tp, dkeys, &info);
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
index a0bcc38486a5..06e492fd479d 100644
--- a/fs/xfs/xfs_fsmap.h
+++ b/fs/xfs/xfs_fsmap.h
@@ -28,6 +28,21 @@ struct xfs_fsmap_head {
struct xfs_fsmap fmh_keys[2]; /* low and high keys */
};
+/* internal fsmap record format */
+struct xfs_fsmap_irec {
+ xfs_daddr_t start_daddr;
+ xfs_daddr_t len_daddr;
+ uint64_t owner; /* extent owner */
+ uint64_t offset; /* offset within the owner */
+ unsigned int rm_flags; /* rmap state flags */
+
+ /*
+ * rmapbt startblock corresponding to start_daddr, if the record came
+ * from an rmap btree.
+ */
+ xfs_agblock_t rec_key;
+};
+
int xfs_ioc_getfsmap(struct xfs_inode *ip, struct fsmap_head __user *arg);
#endif /* __XFS_FSMAP_H__ */
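
As a rough sketch of the intended data flow (inferred from the conversions above, not stated in this header): each btree helper fills in owner, offset, rm_flags, and rec_key; xfs_getfsmap_group_helper() translates the group-relative block range into start_daddr and len_daddr; and xfs_getfsmap_helper() then emits records working purely in daddr units.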
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b247d895c276..28dde215c899 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -162,9 +162,7 @@ xfs_growfs_data_private(
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
delta, last_pag, &lastag_extended);
} else {
- xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
- "EXPERIMENTAL online shrink feature in use. Use at your own risk!");
-
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SHRINK);
error = xfs_ag_shrink_space(last_pag, &tp, -delta);
}
xfs_perag_put(last_pag);
@@ -528,13 +526,12 @@ int
xfs_fs_reserve_ag_blocks(
struct xfs_mount *mp)
{
- xfs_agnumber_t agno;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
int error = 0;
int err2;
mp->m_finobt_nores = false;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
err2 = xfs_ag_resv_init(pag, NULL);
if (err2 && !error)
error = err2;
@@ -556,9 +553,8 @@ void
xfs_fs_unreserve_ag_blocks(
struct xfs_mount *mp)
{
- xfs_agnumber_t agno;
- struct xfs_perag *pag;
+ struct xfs_perag *pag = NULL;
- for_each_perag(mp, agno, pag)
+ while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}
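
A minimal sketch of the iterator contract these conversions assume (do_something() is a hypothetical callback): xfs_perag_next() releases the group it was passed and grabs the next one, so a loop that runs to completion leaves no references behind, while an early break leaves exactly one for the caller to drop.

	static int
	example_walk_all_ags(
		struct xfs_mount	*mp)
	{
		struct xfs_perag	*pag = NULL;
		int			error = 0;

		while ((pag = xfs_perag_next(mp, pag))) {
			error = do_something(pag);	/* hypothetical */
			if (error) {
				/* early exit: we still hold this reference */
				xfs_perag_rele(pag);
				break;
			}
		}
		return error;
	}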
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index 49e5e5f04e60..f19fce557354 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -85,22 +85,23 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = EMPTY_FD;
struct path path;
int error;
struct xfs_inode *ip;
if (cmd == XFS_IOC_FD_TO_HANDLE) {
- f = fdget(hreq->fd);
- if (!fd_file(f))
+ CLASS(fd, f)(hreq->fd);
+
+ if (fd_empty(f))
return -EBADF;
- inode = file_inode(fd_file(f));
+ path = fd_file(f)->f_path;
+ path_get(&path);
} else {
error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
- inode = d_inode(path.dentry);
}
+ inode = d_inode(path.dentry);
ip = XFS_I(inode);
/*
@@ -134,10 +135,7 @@ xfs_find_handle(
error = 0;
out_put:
- if (cmd == XFS_IOC_FD_TO_HANDLE)
- fdput(f);
- else
- path_put(&path);
+ path_put(&path);
return error;
}
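
A hedged sketch of the scope-guarded descriptor pattern adopted here (do_work() is hypothetical): CLASS(fd, f)(ufd) resolves the descriptor and registers cleanup, so fdput() runs automatically when f leaves scope and every return path is covered without goto chains.

	static int
	example_fd_user(
		int		ufd)
	{
		CLASS(fd, f)(ufd);

		if (fd_empty(f))
			return -EBADF;	/* cleanup still runs automatically */

		return do_work(file_inode(fd_file(f)));	/* hypothetical */
	}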
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 10f116d093a2..c7c2e6561998 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -18,6 +18,22 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_quota_defs.h"
+#include "xfs_rtgroup.h"
+
+static void
+xfs_health_unmount_group(
+ struct xfs_group *xg,
+ bool *warn)
+{
+ unsigned int sick = 0;
+ unsigned int checked = 0;
+
+ xfs_group_measure_sickness(xg, &sick, &checked);
+ if (sick) {
+ trace_xfs_group_unfixed_corruption(xg, sick);
+ *warn = true;
+ }
+}
/*
* Warn about metadata corruption that we detected but haven't fixed, and
@@ -28,8 +44,8 @@ void
xfs_health_unmount(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
+ struct xfs_rtgroup *rtg = NULL;
unsigned int sick = 0;
unsigned int checked = 0;
bool warn = false;
@@ -38,20 +54,12 @@ xfs_health_unmount(
return;
/* Measure AG corruption levels. */
- for_each_perag(mp, agno, pag) {
- xfs_ag_measure_sickness(pag, &sick, &checked);
- if (sick) {
- trace_xfs_ag_unfixed_corruption(mp, agno, sick);
- warn = true;
- }
- }
+ while ((pag = xfs_perag_next(mp, pag)))
+ xfs_health_unmount_group(pag_group(pag), &warn);
- /* Measure realtime volume corruption levels. */
- xfs_rt_measure_sickness(mp, &sick, &checked);
- if (sick) {
- trace_xfs_rt_unfixed_corruption(mp, sick);
- warn = true;
- }
+ /* Measure realtime group corruption levels. */
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_health_unmount_group(rtg_group(rtg), &warn);
/*
* Measure fs corruption and keep the sample around for the warning.
@@ -150,65 +158,6 @@ xfs_fs_measure_sickness(
spin_unlock(&mp->m_sb_lock);
}
-/* Mark unhealthy realtime metadata. */
-void
-xfs_rt_mark_sick(
- struct xfs_mount *mp,
- unsigned int mask)
-{
- ASSERT(!(mask & ~XFS_SICK_RT_ALL));
- trace_xfs_rt_mark_sick(mp, mask);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_rt_sick |= mask;
- spin_unlock(&mp->m_sb_lock);
-}
-
-/* Mark realtime metadata as having been checked and found unhealthy by fsck. */
-void
-xfs_rt_mark_corrupt(
- struct xfs_mount *mp,
- unsigned int mask)
-{
- ASSERT(!(mask & ~XFS_SICK_RT_ALL));
- trace_xfs_rt_mark_corrupt(mp, mask);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_rt_sick |= mask;
- mp->m_rt_checked |= mask;
- spin_unlock(&mp->m_sb_lock);
-}
-
-/* Mark a realtime metadata healed. */
-void
-xfs_rt_mark_healthy(
- struct xfs_mount *mp,
- unsigned int mask)
-{
- ASSERT(!(mask & ~XFS_SICK_RT_ALL));
- trace_xfs_rt_mark_healthy(mp, mask);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_rt_sick &= ~mask;
- if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY))
- mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY;
- mp->m_rt_checked |= mask;
- spin_unlock(&mp->m_sb_lock);
-}
-
-/* Sample which realtime metadata are unhealthy. */
-void
-xfs_rt_measure_sickness(
- struct xfs_mount *mp,
- unsigned int *sick,
- unsigned int *checked)
-{
- spin_lock(&mp->m_sb_lock);
- *sick = mp->m_rt_sick;
- *checked = mp->m_rt_checked;
- spin_unlock(&mp->m_sb_lock);
-}
-
/* Mark unhealthy per-ag metadata given a raw AG number. */
void
xfs_agno_mark_sick(
@@ -226,63 +175,95 @@ xfs_agno_mark_sick(
xfs_perag_put(pag);
}
+static inline void
+xfs_group_check_mask(
+ struct xfs_group *xg,
+ unsigned int mask)
+{
+ if (xg->xg_type == XG_TYPE_AG)
+ ASSERT(!(mask & ~XFS_SICK_AG_ALL));
+ else
+ ASSERT(!(mask & ~XFS_SICK_RG_ALL));
+}
+
/* Mark unhealthy per-ag metadata. */
void
-xfs_ag_mark_sick(
- struct xfs_perag *pag,
+xfs_group_mark_sick(
+ struct xfs_group *xg,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_ALL));
- trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+ xfs_group_check_mask(xg, mask);
+ trace_xfs_group_mark_sick(xg, mask);
- spin_lock(&pag->pag_state_lock);
- pag->pag_sick |= mask;
- spin_unlock(&pag->pag_state_lock);
+ spin_lock(&xg->xg_state_lock);
+ xg->xg_sick |= mask;
+ spin_unlock(&xg->xg_state_lock);
}
-/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */
+/*
+ * Mark per-group metadata as having been checked and found unhealthy by fsck.
+ */
void
-xfs_ag_mark_corrupt(
- struct xfs_perag *pag,
+xfs_group_mark_corrupt(
+ struct xfs_group *xg,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_ALL));
- trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask);
+ xfs_group_check_mask(xg, mask);
+ trace_xfs_group_mark_corrupt(xg, mask);
- spin_lock(&pag->pag_state_lock);
- pag->pag_sick |= mask;
- pag->pag_checked |= mask;
- spin_unlock(&pag->pag_state_lock);
+ spin_lock(&xg->xg_state_lock);
+ xg->xg_sick |= mask;
+ xg->xg_checked |= mask;
+ spin_unlock(&xg->xg_state_lock);
}
-/* Mark per-ag metadata ok. */
+/*
+ * Mark per-group metadata ok.
+ */
void
-xfs_ag_mark_healthy(
- struct xfs_perag *pag,
+xfs_group_mark_healthy(
+ struct xfs_group *xg,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_AG_ALL));
- trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
-
- spin_lock(&pag->pag_state_lock);
- pag->pag_sick &= ~mask;
- if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY))
- pag->pag_sick &= ~XFS_SICK_AG_SECONDARY;
- pag->pag_checked |= mask;
- spin_unlock(&pag->pag_state_lock);
+ xfs_group_check_mask(xg, mask);
+ trace_xfs_group_mark_healthy(xg, mask);
+
+ spin_lock(&xg->xg_state_lock);
+ xg->xg_sick &= ~mask;
+ if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY))
+ xg->xg_sick &= ~XFS_SICK_AG_SECONDARY;
+ xg->xg_checked |= mask;
+ spin_unlock(&xg->xg_state_lock);
}
/* Sample which per-ag metadata are unhealthy. */
void
-xfs_ag_measure_sickness(
- struct xfs_perag *pag,
+xfs_group_measure_sickness(
+ struct xfs_group *xg,
unsigned int *sick,
unsigned int *checked)
{
- spin_lock(&pag->pag_state_lock);
- *sick = pag->pag_sick;
- *checked = pag->pag_checked;
- spin_unlock(&pag->pag_state_lock);
+ spin_lock(&xg->xg_state_lock);
+ *sick = xg->xg_sick;
+ *checked = xg->xg_checked;
+ spin_unlock(&xg->xg_state_lock);
+}
+
+/* Mark unhealthy per-rtgroup metadata given a raw rt group number. */
+void
+xfs_rgno_mark_sick(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ unsigned int mask)
+{
+ struct xfs_rtgroup *rtg = xfs_rtgroup_get(mp, rgno);
+
+ /* per-rtgroup structure not set up yet? */
+ if (!rtg)
+ return;
+
+ xfs_group_mark_sick(rtg_group(rtg), mask);
+ xfs_rtgroup_put(rtg);
}
/* Mark the unhealthy parts of an inode. */
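
For orientation, an illustrative sketch of how call sites look once the per-type helpers funnel into the shared group implementation (the masks are real, the surrounding calls are not quoted from this patch):

	/* AG caller: wrap the perag in its embedded group */
	xfs_group_mark_sick(pag_group(pag), XFS_SICK_AG_AGI);

	/* rtgroup caller: same helper, rtgroup-specific mask */
	xfs_group_mark_sick(rtg_group(rtg), XFS_SICK_RG_BITMAP);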
@@ -369,6 +350,9 @@ struct ioctl_sick_map {
unsigned int ioctl_mask;
};
+#define for_each_sick_map(map, m) \
+ for ((m) = (map); (m) < (map) + ARRAY_SIZE(map); (m)++)
+
static const struct ioctl_sick_map fs_map[] = {
{ XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS},
{ XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
@@ -376,13 +360,13 @@ static const struct ioctl_sick_map fs_map[] = {
{ XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
{ XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
{ XFS_SICK_FS_NLINKS, XFS_FSOP_GEOM_SICK_NLINKS },
- { 0, 0 },
+ { XFS_SICK_FS_METADIR, XFS_FSOP_GEOM_SICK_METADIR },
+ { XFS_SICK_FS_METAPATH, XFS_FSOP_GEOM_SICK_METAPATH },
};
static const struct ioctl_sick_map rt_map[] = {
- { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP },
- { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY },
- { 0, 0 },
+ { XFS_SICK_RG_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP },
+ { XFS_SICK_RG_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY },
};
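
A small sketch of what for_each_sick_map() expands to, which is why the { 0, 0 } sentinel rows are dropped from these tables: iteration is bounded by ARRAY_SIZE() of the statically sized array instead of a terminator entry.

	/* open-coded equivalent of for_each_sick_map(fs_map, m) */
	const struct ioctl_sick_map	*m;

	for (m = fs_map; m < fs_map + ARRAY_SIZE(fs_map); m++)
		xfgeo_health_tick(geo, sick, checked, m);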
static inline void
@@ -404,6 +388,7 @@ xfs_fsop_geom_health(
struct xfs_mount *mp,
struct xfs_fsop_geom *geo)
{
+ struct xfs_rtgroup *rtg = NULL;
const struct ioctl_sick_map *m;
unsigned int sick;
unsigned int checked;
@@ -412,12 +397,14 @@ xfs_fsop_geom_health(
geo->checked = 0;
xfs_fs_measure_sickness(mp, &sick, &checked);
- for (m = fs_map; m->sick_mask; m++)
+ for_each_sick_map(fs_map, m)
xfgeo_health_tick(geo, sick, checked, m);
- xfs_rt_measure_sickness(mp, &sick, &checked);
- for (m = rt_map; m->sick_mask; m++)
- xfgeo_health_tick(geo, sick, checked, m);
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked);
+ for_each_sick_map(rt_map, m)
+ xfgeo_health_tick(geo, sick, checked, m);
+ }
}
static const struct ioctl_sick_map ag_map[] = {
@@ -432,7 +419,6 @@ static const struct ioctl_sick_map ag_map[] = {
{ XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
{ XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
{ XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES },
- { 0, 0 },
};
/* Fill out ag geometry health info. */
@@ -448,8 +434,8 @@ xfs_ag_geom_health(
ageo->ag_sick = 0;
ageo->ag_checked = 0;
- xfs_ag_measure_sickness(pag, &sick, &checked);
- for (m = ag_map; m->sick_mask; m++) {
+ xfs_group_measure_sickness(pag_group(pag), &sick, &checked);
+ for_each_sick_map(ag_map, m) {
if (checked & m->sick_mask)
ageo->ag_checked |= m->ioctl_mask;
if (sick & m->sick_mask)
@@ -457,6 +443,34 @@ xfs_ag_geom_health(
}
}
+static const struct ioctl_sick_map rtgroup_map[] = {
+ { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER },
+ { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP },
+ { XFS_SICK_RG_SUMMARY, XFS_RTGROUP_GEOM_SICK_SUMMARY },
+};
+
+/* Fill out rtgroup geometry health info. */
+void
+xfs_rtgroup_geom_health(
+ struct xfs_rtgroup *rtg,
+ struct xfs_rtgroup_geometry *rgeo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ rgeo->rg_sick = 0;
+ rgeo->rg_checked = 0;
+
+ xfs_group_measure_sickness(rtg_group(rtg), &sick, &checked);
+ for_each_sick_map(rtgroup_map, m) {
+ if (checked & m->sick_mask)
+ rgeo->rg_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ rgeo->rg_sick |= m->ioctl_mask;
+ }
+}
+
static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_CORE, XFS_BS_SICK_INODE },
{ XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD },
@@ -471,7 +485,6 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
{ XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
{ XFS_SICK_INO_DIRTREE, XFS_BS_SICK_DIRTREE },
- { 0, 0 },
};
/* Fill out bulkstat health info. */
@@ -488,7 +501,7 @@ xfs_bulkstat_health(
bs->bs_checked = 0;
xfs_inode_measure_sickness(ip, &sick, &checked);
- for (m = ino_map; m->sick_mask; m++) {
+ for_each_sick_map(ino_map, m) {
if (checked & m->sick_mask)
bs->bs_checked |= m->ioctl_mask;
if (sick & m->sick_mask)
@@ -527,24 +540,13 @@ void
xfs_btree_mark_sick(
struct xfs_btree_cur *cur)
{
- switch (cur->bc_ops->type) {
- case XFS_BTREE_TYPE_MEM:
- /* no health state tracking for ephemeral btrees */
- return;
- case XFS_BTREE_TYPE_AG:
+ if (xfs_btree_is_bmap(cur->bc_ops)) {
+ xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork);
+ /* no health state tracking for ephemeral btrees */
+ } else if (cur->bc_ops->type != XFS_BTREE_TYPE_MEM) {
+ ASSERT(cur->bc_group);
ASSERT(cur->bc_ops->sick_mask);
- xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
- return;
- case XFS_BTREE_TYPE_INODE:
- if (xfs_btree_is_bmap(cur->bc_ops)) {
- xfs_bmap_mark_sick(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
- return;
- }
- fallthrough;
- default:
- ASSERT(0);
- return;
+ xfs_group_mark_sick(cur->bc_group, cur->bc_ops->sick_mask);
}
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 6b119a7a324f..7b6c026d01a1 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -25,6 +25,9 @@
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_metafile.h"
#include <linux/iversion.h>
@@ -204,7 +207,7 @@ xfs_reclaim_work_queue(
{
rcu_read_lock();
- if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
+ if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
@@ -219,15 +222,14 @@ static inline void
xfs_blockgc_queue(
struct xfs_perag *pag)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
if (!xfs_is_blockgc_enabled(mp))
return;
rcu_read_lock();
if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
- queue_delayed_work(pag->pag_mount->m_blockgc_wq,
- &pag->pag_blockgc_work,
+ queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work,
msecs_to_jiffies(xfs_blockgc_secs * 1000));
rcu_read_unlock();
}
@@ -239,7 +241,6 @@ xfs_perag_set_inode_tag(
xfs_agino_t agino,
unsigned int tag)
{
- struct xfs_mount *mp = pag->pag_mount;
bool was_tagged;
lockdep_assert_held(&pag->pag_ici_lock);
@@ -253,13 +254,13 @@ xfs_perag_set_inode_tag(
if (was_tagged)
return;
- /* propagate the tag up into the perag radix tree */
- xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
+ /* propagate the tag up into the pag xarray */
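Note: the replaces below intentionally keep the wrapper names xfs_group_set_mark/xfs_group_clear_mark symmetric with the lookup path, so tag propagation and the xarray mark state stay in one place.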
+ xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag));
/* start background work */
switch (tag) {
case XFS_ICI_RECLAIM_TAG:
- xfs_reclaim_work_queue(mp);
+ xfs_reclaim_work_queue(pag_mount(pag));
break;
case XFS_ICI_BLOCKGC_TAG:
xfs_blockgc_queue(pag);
@@ -276,8 +277,6 @@ xfs_perag_clear_inode_tag(
xfs_agino_t agino,
unsigned int tag)
{
- struct xfs_mount *mp = pag->pag_mount;
-
lockdep_assert_held(&pag->pag_ici_lock);
/*
@@ -295,9 +294,8 @@ xfs_perag_clear_inode_tag(
if (radix_tree_tagged(&pag->pag_ici_root, tag))
return;
- /* clear the tag from the perag radix tree */
- xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
-
+ /* clear the tag from the pag xarray */
+ xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag));
trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}
@@ -310,22 +308,9 @@ xfs_perag_grab_next_tag(
struct xfs_perag *pag,
int tag)
{
- unsigned long index = 0;
-
- if (pag) {
- index = pag->pag_agno + 1;
- xfs_perag_rele(pag);
- }
-
- rcu_read_lock();
- pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
- if (pag) {
- trace_xfs_perag_grab_next_tag(pag, _RET_IP_);
- if (!atomic_inc_not_zero(&pag->pag_active_ref))
- pag = NULL;
- }
- rcu_read_unlock();
- return pag;
+ return to_perag(xfs_group_grab_next_mark(mp,
+ pag ? pag_group(pag) : NULL,
+ ici_tag_to_mark(tag), XG_TYPE_AG));
}
/*
@@ -847,6 +832,77 @@ out_error_or_again:
}
/*
+ * Get a metadata inode.
+ *
+ * The metafile type must match the file mode exactly, and for files in the
+ * metadata directory tree, it must match the inode's metatype exactly.
+ */
+int
+xfs_trans_metafile_iget(
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *ip;
+ umode_t mode;
+ int error;
+
+ error = xfs_iget(mp, tp, ino, 0, 0, &ip);
+ if (error == -EFSCORRUPTED || error == -EINVAL)
+ goto whine;
+ if (error)
+ return error;
+
+ if (VFS_I(ip)->i_nlink == 0)
+ goto bad_rele;
+
+ if (metafile_type == XFS_METAFILE_DIR)
+ mode = S_IFDIR;
+ else
+ mode = S_IFREG;
+ if (inode_wrong_type(VFS_I(ip), mode))
+ goto bad_rele;
+ if (xfs_has_metadir(mp)) {
+ if (!xfs_is_metadir_inode(ip))
+ goto bad_rele;
+ if (metafile_type != ip->i_metatype)
+ goto bad_rele;
+ }
+
+ *ipp = ip;
+ return 0;
+bad_rele:
+ xfs_irele(ip);
+whine:
+ xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino,
+ metafile_type);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
+ return -EFSCORRUPTED;
+}
+
+/* Grab a metadata file if the caller doesn't already have a transaction. */
+int
+xfs_metafile_iget(
+ struct xfs_mount *mp,
+ xfs_ino_t ino,
+ enum xfs_metafile_type metafile_type,
+ struct xfs_inode **ipp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
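+/*
+ * Illustrative caller only, assuming the XFS_METAFILE_RTBITMAP type
+ * from the same series: xfs_metafile_iget() supplies the empty
+ * transaction, and the type check rejects a mismatched or
+ * non-metadir inode with -EFSCORRUPTED.
+ *
+ * static int
+ * example_load_rtbitmap(
+ *	struct xfs_mount	*mp,
+ *	xfs_ino_t		ino,
+ *	struct xfs_inode	**ipp)
+ * {
+ *	return xfs_metafile_iget(mp, ino, XFS_METAFILE_RTBITMAP, ipp);
+ * }
+ */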
+/*
* Grab the inode for reclaim exclusively.
*
* We have found this inode via a lookup under RCU, so the inode may have
@@ -1014,7 +1070,7 @@ xfs_reclaim_inodes(
if (xfs_want_reclaim_sick(mp))
icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
- while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
+ while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
xfs_ail_push_all_sync(mp->m_ail);
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
}
@@ -1056,7 +1112,7 @@ long
xfs_reclaim_inodes_count(
struct xfs_mount *mp)
{
- XA_STATE (xas, &mp->m_perags, 0);
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0);
long reclaimable = 0;
struct xfs_perag *pag;
@@ -1401,13 +1457,12 @@ void
xfs_blockgc_stop(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
if (!xfs_clear_blockgc_enabled(mp))
return;
- for_each_perag(mp, agno, pag)
+ while ((pag = xfs_perag_next(mp, pag)))
cancel_delayed_work_sync(&pag->pag_blockgc_work);
trace_xfs_blockgc_stop(mp, __return_address);
}
@@ -1499,7 +1554,7 @@ xfs_blockgc_worker(
{
struct xfs_perag *pag = container_of(to_delayed_work(work),
struct xfs_perag, pag_blockgc_work);
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
int error;
trace_xfs_blockgc_worker(mp, __return_address);
@@ -1507,7 +1562,7 @@ xfs_blockgc_worker(
error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
if (error)
xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
- pag->pag_agno, error);
+ pag_agno(pag), error);
xfs_blockgc_queue(pag);
}
@@ -1548,8 +1603,7 @@ xfs_blockgc_flush_all(
* queued, it will not be requeued. Then flush whatever is left.
*/
while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
- mod_delayed_work(pag->pag_mount->m_blockgc_wq,
- &pag->pag_blockgc_work, 0);
+ mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0);
while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
flush_delayed_work(&pag->pag_blockgc_work);
@@ -1688,7 +1742,7 @@ xfs_icwalk_ag(
enum xfs_icwalk_goal goal,
struct xfs_icwalk *icw)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
uint32_t first_index;
int last_error = 0;
int skipped;
@@ -1741,7 +1795,7 @@ restart:
* us to see this inode, so another lookup from the
* same index will not find it again.
*/
- if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+ if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 19dcb569a3e7..c8ad2606f928 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -43,6 +43,7 @@
#include "xfs_parent.h"
#include "xfs_xattr.h"
#include "xfs_inode_util.h"
+#include "xfs_metafile.h"
struct kmem_cache *xfs_inode_cache;
@@ -341,8 +342,7 @@ xfs_lock_inumorder(
{
uint class = 0;
- ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
- XFS_ILOCK_RTSUM)));
+ ASSERT(!(lock_mode & XFS_ILOCK_PARENT));
ASSERT(xfs_lockdep_subclass_ok(subclass));
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
@@ -554,8 +554,20 @@ xfs_lookup(
if (error)
goto out_free_name;
+ /*
+ * Fail if a directory entry in the regular directory tree points to
+ * a metadata file.
+ */
+ if (XFS_IS_CORRUPT(dp->i_mount, xfs_is_metadir_inode(*ipp))) {
+ xfs_fs_mark_sick(dp->i_mount, XFS_SICK_FS_METADIR);
+ error = -EFSCORRUPTED;
+ goto out_irele;
+ }
+
return 0;
+out_irele:
+ xfs_irele(*ipp);
out_free_name:
if (ci_name)
kfree(ci_name->name);
@@ -1295,7 +1307,7 @@ xfs_inode_needs_inactive(
return false;
/* Metadata inodes require explicit resource cleanup. */
- if (xfs_is_metadata_inode(ip))
+ if (xfs_is_internal_inode(ip))
return false;
/* Want to clean out the cow blocks if there are any. */
@@ -1388,7 +1400,7 @@ xfs_inactive(
goto out;
/* Metadata inodes require explicit resource cleanup. */
- if (xfs_is_metadata_inode(ip))
+ if (xfs_is_internal_inode(ip))
goto out;
/* Try to clean out the cow blocks if there are any. */
@@ -1514,9 +1526,8 @@ xfs_iunlink_reload_next(
xfs_agino_t next_agino)
{
struct xfs_perag *pag = agibp->b_pag;
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_inode *next_ip = NULL;
- xfs_ino_t ino;
int error;
ASSERT(next_agino != NULLAGINO);
@@ -1530,7 +1541,7 @@ xfs_iunlink_reload_next(
xfs_info_ratelimited(mp,
"Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
- next_agino, pag->pag_agno);
+ next_agino, pag_agno(pag));
/*
* Use an untrusted lookup just to be cautious in case the AGI has been
@@ -1538,8 +1549,8 @@ xfs_iunlink_reload_next(
* but we'd rather shut down now since we're already running in a weird
* situation.
*/
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
- error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
+ error = xfs_iget(mp, tp, xfs_agino_to_ino(pag, next_agino),
+ XFS_IGET_UNTRUSTED, 0, &next_ip);
if (error) {
xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
return error;
@@ -1573,7 +1584,7 @@ xfs_ifree_mark_inode_stale(
struct xfs_inode *free_ip,
xfs_ino_t inum)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_inode_log_item *iip;
struct xfs_inode *ip;
@@ -3041,7 +3052,7 @@ xfs_inode_alloc_unitsize(
/* Should we always be using copy on write for file writes? */
bool
xfs_is_always_cow_inode(
- struct xfs_inode *ip)
+ const struct xfs_inode *ip)
{
return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 03944b6c5fba..b0de3d924d4c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -65,6 +65,7 @@ typedef struct xfs_inode {
uint16_t i_flushiter; /* incremented on flush */
};
uint8_t i_forkoff; /* attr fork offset >> 3 */
+ enum xfs_metafile_type i_metatype; /* XFS_METAFILE_* */
uint16_t i_diflags; /* XFS_DIFLAG_... */
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
@@ -100,7 +101,7 @@ static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
return ip->i_prev_unlinked != 0;
}
-static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+static inline bool xfs_inode_has_attr_fork(const struct xfs_inode *ip)
{
return ip->i_forkoff > 0;
}
@@ -271,23 +272,36 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned long flags)
return ret;
}
-static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
+static inline bool xfs_is_reflink_inode(const struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
}
-static inline bool xfs_is_metadata_inode(const struct xfs_inode *ip)
+static inline bool xfs_is_metadir_inode(const struct xfs_inode *ip)
+{
+ return ip->i_diflags2 & XFS_DIFLAG2_METADATA;
+}
+
+static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
+ /* Any file in the metadata directory tree is a metadata inode. */
+ if (xfs_has_metadir(mp))
+ return xfs_is_metadir_inode(ip);
+
+ /*
+ * Before metadata directories, the only metadata inodes were the
+ * three quota files, the realtime bitmap, and the realtime summary.
+ */
return ip->i_ino == mp->m_sb.sb_rbmino ||
ip->i_ino == mp->m_sb.sb_rsumino ||
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
-bool xfs_is_always_cow_inode(struct xfs_inode *ip);
+bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
-static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
{
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
@@ -301,17 +315,17 @@ static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip)
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
*/
-static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
+static inline bool xfs_inode_has_cow_data(const struct xfs_inode *ip)
{
return ip->i_cowfp && ip->i_cowfp->if_bytes;
}
-static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+static inline bool xfs_inode_has_bigtime(const struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
-static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
+static inline bool xfs_inode_has_large_extent_counts(const struct xfs_inode *ip)
{
return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
}
@@ -320,7 +334,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
* Decide if this file is a realtime file whose data allocation unit is larger
* than a single filesystem block.
*/
-static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
+static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
{
return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
}
@@ -332,6 +346,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+static inline bool
+xfs_inode_can_atomicwrite(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ return false;
+ if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+ return false;
+
+ return true;
+}
+
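A worked example of the window check above, with illustrative device numbers: a target advertising bt_bdev_awu_min = 4096 and bt_bdev_awu_max = 65536 permits atomic writes on a 4096-byte-block filesystem (4096 lies inside [4096, 65536]) but not on a 1024-byte-block one, since 1024 < awu_min.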
/*
* In-core inode flags.
*/
@@ -434,9 +463,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
* However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
* limited to the subclasses we can represent via nesting. We need at least
* 5 inodes nest depth for the ILOCK through rename, and we also have to support
- * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
- * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
- * 8 subclasses supported by lockdep.
+ * XFS_ILOCK_PARENT, which gives 6 subclasses. That's 6 of the 8 subclasses
+ * supported by lockdep.
*
* This also means we have to number the sub-classes in the lowest bits of
* the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
@@ -462,8 +490,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
* ILOCK values
* 0-4 subclass values
* 5 PARENT subclass (not nestable)
- * 6 RTBITMAP subclass (not nestable)
- * 7 RTSUM subclass (not nestable)
+ * 6 unused
+ * 7 unused
*
*/
#define XFS_IOLOCK_SHIFT 16
@@ -478,12 +506,8 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
#define XFS_ILOCK_SHIFT 24
#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6u
-#define XFS_ILOCK_RTSUM_VAL 7u
#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \
XFS_MMAPLOCK_DEP_MASK | \
@@ -625,9 +649,9 @@ void xfs_sort_inodes(struct xfs_inode **i_tab, unsigned int num_inodes);
static inline bool
xfs_inode_unlinked_incomplete(
- struct xfs_inode *ip)
+ const struct xfs_inode *ip)
{
- return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
+ return VFS_IC(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
}
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index b509cbd191f4..912f0b1bc3cb 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -556,7 +556,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode));
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode));
to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
@@ -590,10 +589,16 @@ xfs_inode_to_log_dinode(
/* dummy value for initialisation */
to->di_crc = 0;
+
+ if (xfs_is_metadir_inode(ip))
+ to->di_metatype = ip->i_metatype;
+ else
+ to->di_metatype = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
+ to->di_metatype = 0;
}
xfs_inode_to_log_dinode_iext_counters(ip, to);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index dbdab4ce7c44..e70d2611456b 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -175,7 +175,7 @@ xfs_log_dinode_to_disk(
to->di_mode = cpu_to_be16(from->di_mode);
to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_onlink = 0;
+ to->di_metatype = cpu_to_be16(from->di_metatype);
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
to->di_nlink = cpu_to_be32(from->di_nlink);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2567fd2a0994..0789c18aaa18 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_file.h"
#include "xfs_exchrange.h"
#include "xfs_handle.h"
+#include "xfs_rtgroup.h"
#include <linux/mount.h>
#include <linux/fileattr.h>
@@ -233,6 +234,10 @@ xfs_bulk_ireq_setup(
if (hdr->flags & XFS_BULK_IREQ_NREXT64)
breq->flags |= XFS_IBULK_NREXT64;
+ /* Caller wants to see metadata directories in bulkstat output. */
+ if (hdr->flags & XFS_BULK_IREQ_METADIR)
+ breq->flags |= XFS_IBULK_METADIR;
+
return 0;
}
@@ -323,6 +328,9 @@ xfs_ioc_inumbers(
if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
return -EFAULT;
+ if (hdr.flags & XFS_BULK_IREQ_METADIR)
+ return -EINVAL;
+
error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
if (error == -ECANCELED)
goto out_teardown;
@@ -396,6 +404,38 @@ xfs_ioc_ag_geometry(
return 0;
}
+STATIC int
+xfs_ioc_rtgroup_geometry(
+ struct xfs_mount *mp,
+ void __user *arg)
+{
+ struct xfs_rtgroup *rtg;
+ struct xfs_rtgroup_geometry rgeo;
+ int error;
+
+ if (copy_from_user(&rgeo, arg, sizeof(rgeo)))
+ return -EFAULT;
+ if (rgeo.rg_flags)
+ return -EINVAL;
+ if (memchr_inv(&rgeo.rg_reserved, 0, sizeof(rgeo.rg_reserved)))
+ return -EINVAL;
+ if (!xfs_has_rtgroups(mp))
+ return -EINVAL;
+
+ rtg = xfs_rtgroup_get(mp, rgeo.rg_number);
+ if (!rtg)
+ return -EINVAL;
+
+ error = xfs_rtgroup_get_geometry(rtg, &rgeo);
+ xfs_rtgroup_put(rtg);
+ if (error)
+ return error;
+
+ if (copy_to_user(arg, &rgeo, sizeof(rgeo)))
+ return -EFAULT;
+ return 0;
+}
+
/*
* Linux extended inode flags interface.
*/
@@ -881,41 +921,29 @@ xfs_ioc_swapext(
xfs_swapext_t *sxp)
{
xfs_inode_t *ip, *tip;
- struct fd f, tmp;
- int error = 0;
/* Pull information for the target fd */
- f = fdget((int)sxp->sx_fdtarget);
- if (!fd_file(f)) {
- error = -EINVAL;
- goto out;
- }
+ CLASS(fd, f)((int)sxp->sx_fdtarget);
+ if (fd_empty(f))
+ return -EINVAL;
if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
!(fd_file(f)->f_mode & FMODE_READ) ||
- (fd_file(f)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_file;
- }
+ (fd_file(f)->f_flags & O_APPEND))
+ return -EBADF;
- tmp = fdget((int)sxp->sx_fdtmp);
- if (!fd_file(tmp)) {
- error = -EINVAL;
- goto out_put_file;
- }
+ CLASS(fd, tmp)((int)sxp->sx_fdtmp);
+ if (fd_empty(tmp))
+ return -EINVAL;
if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
!(fd_file(tmp)->f_mode & FMODE_READ) ||
- (fd_file(tmp)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_tmp_file;
- }
+ (fd_file(tmp)->f_flags & O_APPEND))
+ return -EBADF;
if (IS_SWAPFILE(file_inode(fd_file(f))) ||
- IS_SWAPFILE(file_inode(fd_file(tmp)))) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ IS_SWAPFILE(file_inode(fd_file(tmp))))
+ return -EINVAL;
/*
* We need to ensure that the fds passed in point to XFS inodes
@@ -923,37 +951,22 @@ xfs_ioc_swapext(
* control over what the user passes us here.
*/
if (fd_file(f)->f_op != &xfs_file_operations ||
- fd_file(tmp)->f_op != &xfs_file_operations) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ fd_file(tmp)->f_op != &xfs_file_operations)
+ return -EINVAL;
ip = XFS_I(file_inode(fd_file(f)));
tip = XFS_I(file_inode(fd_file(tmp)));
- if (ip->i_mount != tip->i_mount) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
-
- if (ip->i_ino == tip->i_ino) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ if (ip->i_mount != tip->i_mount)
+ return -EINVAL;
- if (xfs_is_shutdown(ip->i_mount)) {
- error = -EIO;
- goto out_put_tmp_file;
- }
+ if (ip->i_ino == tip->i_ino)
+ return -EINVAL;
- error = xfs_swap_extents(ip, tip, sxp);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
- out_put_tmp_file:
- fdput(tmp);
- out_put_file:
- fdput(f);
- out:
- return error;
+ return xfs_swap_extents(ip, tip, sxp);
}
static int
@@ -1021,7 +1034,7 @@ xfs_ioc_setlabel(
* buffered reads from userspace (i.e. from blkid) are invalidated,
* and userspace will see the newly-written label.
*/
- error = xfs_sync_sb_buf(mp);
+ error = xfs_sync_sb_buf(mp, true);
if (error)
goto out;
/*
@@ -1032,6 +1045,8 @@ xfs_ioc_setlabel(
mutex_unlock(&mp->m_growlock);
invalidate_bdev(mp->m_ddev_targp->bt_bdev);
+ if (xfs_has_rtsb(mp) && mp->m_rtdev_targp)
+ invalidate_bdev(mp->m_rtdev_targp->bt_bdev);
out:
mnt_drop_write_file(filp);
@@ -1216,6 +1231,8 @@ xfs_file_ioctl(
case XFS_IOC_AG_GEOMETRY:
return xfs_ioc_ag_geometry(mp, arg);
+ case XFS_IOC_RTGROUP_GEOMETRY:
+ return xfs_ioc_rtgroup_geometry(mp, arg);
case XFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 86da16f54be9..50fa3ef89f6c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -24,6 +24,7 @@
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_quota.h"
+#include "xfs_rtgroup.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
@@ -115,7 +116,9 @@ xfs_bmbt_to_iomap(
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
- iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+ xfs_daddr_t daddr = xfs_fsb_to_db(ip, imap->br_startblock);
+
+ iomap->addr = BBTOB(daddr);
if (mapping_flags & IOMAP_DAX)
iomap->addr += target->bt_dax_part_off;
@@ -124,6 +127,14 @@ xfs_bmbt_to_iomap(
else
iomap->type = IOMAP_MAPPED;
+ /*
+ * Mark iomaps starting at the first sector of an RTG as merge
+ * boundary so that each I/O completion is contained within a
+ * single RTG.
+ */
+ if (XFS_IS_REALTIME_INODE(ip) && xfs_has_rtgroups(mp) &&
+ xfs_rtbno_is_group_start(mp, imap->br_startblock))
+ iomap->flags |= IOMAP_F_BOUNDARY;
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
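
A plausible sketch of the start-of-group test relied on above, assuming power-of-two rtgroup block geometry and a per-type blkmask in the mount's group table (both assumptions, not quoted from this patch):

	/* an rtblock starts a group when its in-group offset is zero */
	static inline bool
	example_rtbno_is_group_start(
		struct xfs_mount	*mp,
		xfs_rtblock_t		rtbno)
	{
		return (rtbno & mp->m_groups[XG_TYPE_RTG].blkmask) == 0;
	}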
@@ -342,16 +353,26 @@ xfs_quota_need_throttle(
xfs_fsblock_t alloc_blocks)
{
struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot_res *res;
+ struct xfs_dquot_pre *pre;
if (!dq || !xfs_this_quota_on(ip->i_mount, type))
return false;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ res = &dq->q_rtb;
+ pre = &dq->q_rtb_prealloc;
+ } else {
+ res = &dq->q_blk;
+ pre = &dq->q_blk_prealloc;
+ }
+
/* no hi watermark, no throttle */
- if (!dq->q_prealloc_hi_wmark)
+ if (!pre->q_prealloc_hi_wmark)
return false;
/* under the lo watermark, no throttle */
- if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
+ if (res->reserved + alloc_blocks < pre->q_prealloc_lo_wmark)
return false;
return true;
@@ -366,22 +387,35 @@ xfs_quota_calc_throttle(
int64_t *qfreesp)
{
struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot_res *res;
+ struct xfs_dquot_pre *pre;
int64_t freesp;
int shift = 0;
+ if (!dq) {
+ res = NULL;
+ pre = NULL;
+ } else if (XFS_IS_REALTIME_INODE(ip)) {
+ res = &dq->q_rtb;
+ pre = &dq->q_rtb_prealloc;
+ } else {
+ res = &dq->q_blk;
+ pre = &dq->q_blk_prealloc;
+ }
+
/* no dq, or over hi wmark, squash the prealloc completely */
- if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
+ if (!res || res->reserved >= pre->q_prealloc_hi_wmark) {
*qblocks = 0;
*qfreesp = 0;
return;
}
- freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
- if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+ freesp = pre->q_prealloc_hi_wmark - res->reserved;
+ if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT]) {
shift = 2;
- if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+ if (freesp < pre->q_low_space[XFS_QLOWSP_3_PCNT])
shift += 2;
- if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+ if (freesp < pre->q_low_space[XFS_QLOWSP_1_PCNT])
shift += 2;
}
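
Worked through, the shift above scales preallocation back geometrically as the reservation nears its high watermark: below the 5% low-space threshold shift = 2 (the prealloc size is later divided by 4), below 3% shift = 4 (divided by 16), and below 1% shift = 6 (divided by 64). The only change in this hunk is that realtime inodes now drive the same curve from the rtb reservation and its prealloc watermarks rather than the blk ones.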
@@ -501,8 +535,8 @@ xfs_iomap_prealloc_size(
alloc_blocks);
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
- freesp = xfs_rtx_to_rtb(mp,
- xfs_iomap_freesp(&mp->m_frextents,
+ freesp = xfs_rtbxlen_to_blen(mp,
+ xfs_iomap_freesp(&mp->m_frextents,
mp->m_low_rtexts, &shift));
else
freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
@@ -1234,6 +1268,14 @@ xfs_buffered_write_iomap_end(
if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
return 0;
+ /*
+ * iomap_page_mkwrite() will never fail in a way that requires delalloc
+ * extents that it allocated to be revoked. Hence never try to release
+ * them here.
+ */
+ if (flags & IOMAP_FAULT)
+ return 0;
+
/* Nothing to do if we've written the entire delalloc extent */
start_byte = iomap_last_written_block(inode, offset, written);
end_byte = round_up(offset + length, i_blocksize(inode));
@@ -1260,15 +1302,6 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = {
.iomap_end = xfs_buffered_write_iomap_end,
};
-/*
- * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
- * that it allocated to be revoked. Hence we do not need an .iomap_end method
- * for this operation.
- */
-const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
- .iomap_begin = xfs_buffered_write_iomap_begin,
-};
-
static int
xfs_read_iomap_begin(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 4da13440bae9..8347268af727 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -48,7 +48,6 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
-extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ee79cf161312..207e0dadffc3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -42,7 +42,9 @@
* held. For regular files, the lock order is the other way around - the
* mmap_lock is taken during the page fault, and then we lock the ilock to do
* block mapping. Hence we need a different class for the directory ilock so
- * that lockdep can tell them apart.
+ * that lockdep can tell them apart. Directories in the metadata directory
+ * tree get a separate class so that lockdep reports will warn us if someone
+ * ever tries to lock regular directories after locking metadata directories.
*/
static struct lock_class_key xfs_nondir_ilock_class;
static struct lock_class_key xfs_dir_ilock_class;
@@ -570,6 +572,20 @@ xfs_stat_blksize(
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
+static void
+xfs_get_atomic_write_attr(
+ struct xfs_inode *ip,
+ unsigned int *unit_min,
+ unsigned int *unit_max)
+{
+ if (!xfs_inode_can_atomicwrite(ip)) {
+ *unit_min = *unit_max = 0;
+ return;
+ }
+
+ *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+}
+
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
@@ -597,8 +613,9 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -608,11 +625,6 @@ xfs_vn_getattr(
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -643,6 +655,14 @@ xfs_vn_getattr(
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
+ if (request_mask & STATX_WRITE_ATOMIC) {
+ unsigned int unit_min, unit_max;
+
+ xfs_get_atomic_write_attr(ip, &unit_min,
+ &unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ unit_min, unit_max);
+ }
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
@@ -1289,6 +1309,7 @@ xfs_setup_inode(
{
struct inode *inode = &ip->i_vnode;
gfp_t gfp_mask;
+ bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
inode->i_state |= I_NEW;
@@ -1300,6 +1321,16 @@ xfs_setup_inode(
i_size_write(inode, ip->i_disk_size);
xfs_diflags_to_iflags(ip, true);
+ /*
+ * Mark our metadata files as private so that LSMs and the ACL code
+ * don't try to add their own metadata or reason about these files,
+ * and users cannot ever obtain file handles to them.
+ */
+ if (is_meta) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (S_ISDIR(inode->i_mode)) {
/*
* We set the i_rwsem class here to avoid potential races with
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c0757ab99495..1fa1c0564b0c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -36,6 +36,14 @@ struct xfs_bstat_chunk {
struct xfs_bulkstat *buf;
};
+static inline bool
+want_metadir_file(
+ struct xfs_inode *ip,
+ struct xfs_ibulk *breq)
+{
+ return xfs_is_metadir_inode(ip) && (breq->flags & XFS_IBULK_METADIR);
+}
+
/*
* Fill out the bulkstat info for a single inode and report it somewhere.
*
@@ -69,9 +77,6 @@ xfs_bulkstat_one_int(
vfsuid_t vfsuid;
vfsgid_t vfsgid;
- if (xfs_internal_inum(mp, ino))
- goto out_advance;
-
error = xfs_iget(mp, tp, ino,
(XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
XFS_ILOCK_SHARED, &ip);
@@ -97,8 +102,28 @@ xfs_bulkstat_one_int(
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
+ /*
+ * If caller wants files from the metadata directories, push out the
+ * bare minimum information for enabling scrub.
+ */
+ if (want_metadir_file(ip, bc->breq)) {
+ memset(buf, 0, sizeof(*buf));
+ buf->bs_ino = ino;
+ buf->bs_gen = inode->i_generation;
+ buf->bs_mode = inode->i_mode & S_IFMT;
+ xfs_bulkstat_health(ip, buf);
+ buf->bs_version = XFS_BULKSTAT_VERSION_V5;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_irele(ip);
+
+ error = bc->formatter(bc->breq, buf);
+ if (!error || error == -ECANCELED)
+ goto out_advance;
+ goto out;
+ }
+
/* If this is a private inode, don't leak its details to userspace. */
- if (IS_PRIVATE(inode)) {
+ if (IS_PRIVATE(inode) || xfs_is_sb_inum(mp, ino)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_irele(ip);
error = -EINVAL;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1659f13f17a8..f10e8f8f2335 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -22,6 +22,9 @@ struct xfs_ibulk {
/* Fill out the bs_extents64 field if set. */
#define XFS_IBULK_NREXT64 (1U << 1)
+/* Signal that we can return metadata directories. */
+#define XFS_IBULK_METADIR (1U << 2)
+
/*
* Advance the user buffer pointer by one record of the given size. If the
* buffer is now full, return the appropriate error code.
diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c
index 2ddccb172fa0..1fd70a7aed63 100644
--- a/fs/xfs/xfs_iunlink_item.c
+++ b/fs/xfs/xfs_iunlink_item.c
@@ -52,14 +52,14 @@ xfs_iunlink_log_dinode(
struct xfs_trans *tp,
struct xfs_iunlink_item *iup)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_inode *ip = iup->ip;
struct xfs_dinode *dip;
struct xfs_buf *ibp;
+ xfs_agino_t old_ptr;
int offset;
int error;
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp);
+ error = xfs_imap_to_bp(tp->t_mountp, tp, &ip->i_imap, &ibp);
if (error)
return error;
/*
@@ -73,22 +73,21 @@ xfs_iunlink_log_dinode(
dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset);
/* Make sure the old pointer isn't garbage. */
- if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) {
+ old_ptr = be32_to_cpu(dip->di_next_unlinked);
+ if (old_ptr != iup->old_agino) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
goto out;
}
- trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno,
- XFS_INO_TO_AGINO(mp, ip->i_ino),
- be32_to_cpu(dip->di_next_unlinked), iup->next_agino);
+ trace_xfs_iunlink_update_dinode(iup, old_ptr);
dip->di_next_unlinked = cpu_to_be32(iup->next_agino);
offset = ip->i_imap.im_boffset +
offsetof(struct xfs_dinode, di_next_unlinked);
- xfs_dinode_calc_crc(mp, dip);
+ xfs_dinode_calc_crc(tp->t_mountp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
return 0;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 86f14ec7c31f..7db3ece370b1 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -100,7 +100,6 @@ xfs_iwalk_ichunk_ra(
struct xfs_inobt_rec_incore *irec)
{
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_agnumber_t agno = pag->pag_agno;
xfs_agblock_t agbno;
struct blk_plug plug;
int i; /* inode chunk index */
@@ -114,7 +113,7 @@ xfs_iwalk_ichunk_ra(
imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
if (imask & ~irec->ir_free) {
xfs_buf_readahead(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, agbno),
+ xfs_agbno_to_daddr(pag, agbno),
igeo->blocks_per_cluster * mp->m_bsize,
&xfs_inode_buf_ops);
}
@@ -177,20 +176,19 @@ xfs_iwalk_ag_recs(
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
struct xfs_perag *pag = iwag->pag;
- xfs_ino_t ino;
unsigned int i, j;
int error;
for (i = 0; i < iwag->nr_recs; i++) {
struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
- trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);
+ trace_xfs_iwalk_ag_rec(pag, irec);
if (xfs_pwork_want_abort(&iwag->pwork))
return 0;
if (iwag->inobt_walk_fn) {
- error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
+ error = iwag->inobt_walk_fn(mp, tp, pag_agno(pag), irec,
iwag->data);
if (error)
return error;
@@ -208,9 +206,10 @@ xfs_iwalk_ag_recs(
continue;
/* Otherwise call our function. */
- ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
- irec->ir_startino + j);
- error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+ error = iwag->iwalk_fn(mp, tp,
+ xfs_agino_to_ino(pag,
+ irec->ir_startino + j),
+ iwag->data);
if (error)
return error;
}
@@ -305,7 +304,7 @@ xfs_iwalk_ag_start(
return -EFSCORRUPTED;
}
- iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
+ iwag->lastino = xfs_agino_to_ino(pag,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
/*
@@ -406,7 +405,7 @@ xfs_iwalk_ag(
int error = 0;
/* Set up our cursor at the right place in the inode btree. */
- ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
+ ASSERT(pag_agno(pag) == XFS_INO_TO_AGNO(mp, iwag->startino));
agino = XFS_INO_TO_AGINO(mp, iwag->startino);
error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);
@@ -425,7 +424,7 @@ xfs_iwalk_ag(
break;
/* Make sure that we always move forward. */
- rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
+ rec_fsino = xfs_agino_to_ino(pag, irec->ir_startino);
if (iwag->lastino != NULLFSINO &&
XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
xfs_btree_mark_sick(cur);
@@ -535,6 +534,37 @@ xfs_iwalk_prefetch(
return max(inobt_records, 2U);
}
+static int
+xfs_iwalk_args(
+ struct xfs_iwalk_ag *iwag,
+ unsigned int flags)
+{
+ struct xfs_mount *mp = iwag->mp;
+ xfs_agnumber_t start_agno;
+ int error;
+
+ start_agno = XFS_INO_TO_AGNO(iwag->mp, iwag->startino);
+ ASSERT(start_agno < iwag->mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(iwag);
+ if (error)
+ return error;
+
+ while ((iwag->pag = xfs_perag_next_from(mp, iwag->pag, start_agno))) {
+ error = xfs_iwalk_ag(iwag);
+ if (error || (flags & XFS_IWALK_SAME_AG)) {
+ xfs_perag_rele(iwag->pag);
+ break;
+ }
+ iwag->startino =
+ XFS_AGINO_TO_INO(mp, pag_agno(iwag->pag) + 1, 0);
+ }
+
+ xfs_iwalk_free(iwag);
+ return error;
+}
+
/*
* Walk all inodes in the filesystem starting from @startino. The @iwalk_fn
* will be called for each allocated inode, being passed the inode's number and
@@ -563,32 +593,8 @@ xfs_iwalk(
.pwork = XFS_PWORK_SINGLE_THREADED,
.lastino = NULLFSINO,
};
- struct xfs_perag *pag;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
- int error;
-
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
-
- error = xfs_iwalk_alloc(&iwag);
- if (error)
- return error;
-
- for_each_perag_from(mp, agno, pag) {
- iwag.pag = pag;
- error = xfs_iwalk_ag(&iwag);
- if (error)
- break;
- iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
- if (flags & XFS_INOBT_WALK_SAME_AG)
- break;
- iwag.pag = NULL;
- }
- if (iwag.pag)
- xfs_perag_rele(pag);
- xfs_iwalk_free(&iwag);
- return error;
+ return xfs_iwalk_args(&iwag, flags);
}
/* Run per-thread iwalk work. */
@@ -640,19 +646,19 @@ xfs_iwalk_threaded(
bool polled,
void *data)
{
+ xfs_agnumber_t start_agno = XFS_INO_TO_AGNO(mp, startino);
struct xfs_pwork_ctl pctl;
- struct xfs_perag *pag;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ struct xfs_perag *pag = NULL;
int error;
- ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(start_agno < mp->m_sb.sb_agcount);
ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
if (error)
return error;
- for_each_perag_from(mp, agno, pag) {
+ while ((pag = xfs_perag_next_from(mp, pag, start_agno))) {
struct xfs_iwalk_ag *iwag;
if (xfs_pwork_ctl_want_abort(&pctl))
@@ -673,8 +679,8 @@ xfs_iwalk_threaded(
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
- startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
- if (flags & XFS_INOBT_WALK_SAME_AG)
+ startino = XFS_AGINO_TO_INO(mp, pag_agno(pag) + 1, 0);
+ if (flags & XFS_IWALK_SAME_AG)
break;
}
if (pag)
@@ -748,30 +754,6 @@ xfs_inobt_walk(
.pwork = XFS_PWORK_SINGLE_THREADED,
.lastino = NULLFSINO,
};
- struct xfs_perag *pag;
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
- int error;
- ASSERT(agno < mp->m_sb.sb_agcount);
- ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
-
- error = xfs_iwalk_alloc(&iwag);
- if (error)
- return error;
-
- for_each_perag_from(mp, agno, pag) {
- iwag.pag = pag;
- error = xfs_iwalk_ag(&iwag);
- if (error)
- break;
- iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
- if (flags & XFS_INOBT_WALK_SAME_AG)
- break;
- iwag.pag = NULL;
- }
-
- if (iwag.pag)
- xfs_perag_rele(pag);
- xfs_iwalk_free(&iwag);
- return error;
+ return xfs_iwalk_args(&iwag, flags);
}
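
The conversions in this file replace the for_each_perag_from() iteration macro with the xfs_perag_next_from()/xfs_perag_next() iterators. A minimal sketch of the new idiom, with the reference contract inferred from the converted call sites (the iterator hands back a referenced perag and consumes the previous one, so an early break has to drop the current reference explicitly):

	struct xfs_perag	*pag = NULL;
	int			error = 0;

	/* pag starts NULL; the iterator returns NULL once the AGs run out. */
	while ((pag = xfs_perag_next(mp, pag))) {
		error = process_ag(pag);	/* hypothetical per-AG work */
		if (error) {
			/* early exit: release the reference we still hold */
			xfs_perag_rele(pag);
			break;
		}
	}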
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 83699089755e..17a5a2c6debb 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -25,7 +25,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int flags, xfs_iwalk_fn iwalk_fn,
unsigned int inode_records, bool poll, void *data);
-/* Only iterate inodes within the same AG as @startino. */
+/* Only iterate within the same AG as @startino. */
#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
@@ -41,9 +41,4 @@ int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records,
void *data);
-/* Only iterate inobt records within the same AG as @startino. */
-#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG)
-
-#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG)
-
#endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 80da0cf87d7a..2e9157b650e6 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -907,7 +907,7 @@ xlog_cil_committed(
xlog_cil_ail_insert(ctx, abort);
xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
- xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
+ xfs_extent_busy_clear(&ctx->busy_extents.extent_list,
xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
@@ -917,7 +917,6 @@ xlog_cil_committed(
xlog_cil_free_logvec(&ctx->lv_chain);
if (!list_empty(&ctx->busy_extents.extent_list)) {
- ctx->busy_extents.mount = mp;
ctx->busy_extents.owner = ctx;
xfs_discard_extents(mp, &ctx->busy_extents);
return;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 704aaadb61cf..0af3d477197b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1818,6 +1818,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_attrd_item_ops,
&xlog_xmi_item_ops,
&xlog_xmd_item_ops,
+ &xlog_rtefi_item_ops,
+ &xlog_rtefd_item_ops,
};
static const struct xlog_recover_item_ops *
@@ -2677,7 +2679,7 @@ xlog_recover_clear_agi_bucket(
struct xfs_perag *pag,
int bucket)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_trans *tp;
struct xfs_agi *agi;
struct xfs_buf *agibp;
@@ -2708,7 +2710,7 @@ out_abort:
xfs_trans_cancel(tp);
out_error:
xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
- pag->pag_agno);
+ pag_agno(pag));
return;
}
@@ -2718,7 +2720,7 @@ xlog_recover_iunlink_bucket(
struct xfs_agi *agi,
int bucket)
{
- struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_mount *mp = pag_mount(pag);
struct xfs_inode *prev_ip = NULL;
struct xfs_inode *ip;
xfs_agino_t prev_agino, agino;
@@ -2726,9 +2728,8 @@ xlog_recover_iunlink_bucket(
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
- error = xfs_iget(mp, NULL,
- XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
- 0, 0, &ip);
+ error = xfs_iget(mp, NULL, xfs_agino_to_ino(pag, agino), 0, 0,
+ &ip);
if (error)
break;
@@ -2846,10 +2847,9 @@ static void
xlog_recover_process_iunlinks(
struct xlog *log)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
- for_each_perag(log->l_mp, agno, pag)
+ while ((pag = xfs_perag_next(log->l_mp, pag)))
xlog_recover_iunlink_ag(pag);
}
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 8f495cc23903..6ed485ff2756 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -131,3 +131,54 @@ xfs_buf_alert_ratelimited(
__xfs_printk(KERN_ALERT, mp, &vaf);
va_end(args);
}
+
+void
+xfs_warn_experimental(
+ struct xfs_mount *mp,
+ enum xfs_experimental_feat feat)
+{
+ static const struct {
+ const char *name;
+ long opstate;
+ } features[] = {
+ [XFS_EXPERIMENTAL_PNFS] = {
+ .opstate = XFS_OPSTATE_WARNED_PNFS,
+ .name = "pNFS",
+ },
+ [XFS_EXPERIMENTAL_SCRUB] = {
+ .opstate = XFS_OPSTATE_WARNED_SCRUB,
+ .name = "online scrub",
+ },
+ [XFS_EXPERIMENTAL_SHRINK] = {
+ .opstate = XFS_OPSTATE_WARNED_SHRINK,
+ .name = "online shrink",
+ },
+ [XFS_EXPERIMENTAL_LARP] = {
+ .opstate = XFS_OPSTATE_WARNED_LARP,
+ .name = "logged extended attributes",
+ },
+ [XFS_EXPERIMENTAL_LBS] = {
+ .opstate = XFS_OPSTATE_WARNED_LBS,
+ .name = "large block size",
+ },
+ [XFS_EXPERIMENTAL_EXCHRANGE] = {
+ .opstate = XFS_OPSTATE_WARNED_EXCHRANGE,
+ .name = "exchange range",
+ },
+ [XFS_EXPERIMENTAL_PPTR] = {
+ .opstate = XFS_OPSTATE_WARNED_PPTR,
+ .name = "parent pointer",
+ },
+ [XFS_EXPERIMENTAL_METADIR] = {
+ .opstate = XFS_OPSTATE_WARNED_METADIR,
+ .name = "metadata directory tree",
+ },
+ };
+ ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
+ BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
+
+ if (xfs_should_warn(mp, features[feat].opstate))
+ xfs_warn(mp,
+ "EXPERIMENTAL %s feature enabled. Use at your own risk!",
+ features[feat].name);
+}
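
With the table in place, a call site only needs the feature id; the xfs_pnfs.c hunk further below shows a real conversion. A sketch of what a new caller looks like (assuming, as elsewhere in this series, that the metadir feature bit triggers the warning at mount time):

	if (xfs_has_metadir(mp))
		xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);

The BUILD_BUG_ON above keeps the features[] table and the enum in xfs_message.h from drifting apart when new entries are added.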
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index cc323775a12c..7fb36ced9df7 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -75,12 +75,6 @@ do { \
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
-#define xfs_warn_mount(mp, warntag, fmt, ...) \
-do { \
- if (xfs_should_warn((mp), (warntag))) \
- xfs_warn((mp), (fmt), ##__VA_ARGS__); \
-} while (0)
-
#define xfs_warn_once(dev, fmt, ...) \
xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
#define xfs_notice_once(dev, fmt, ...) \
@@ -96,4 +90,18 @@ extern void xfs_hex_dump(const void *p, int length);
void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
const char *fmt, ...);
+enum xfs_experimental_feat {
+ XFS_EXPERIMENTAL_PNFS,
+ XFS_EXPERIMENTAL_SCRUB,
+ XFS_EXPERIMENTAL_SHRINK,
+ XFS_EXPERIMENTAL_LARP,
+ XFS_EXPERIMENTAL_LBS,
+ XFS_EXPERIMENTAL_EXCHRANGE,
+ XFS_EXPERIMENTAL_PPTR,
+ XFS_EXPERIMENTAL_METADIR,
+
+ XFS_EXPERIMENTAL_MAX,
+};
+void xfs_warn_experimental(struct xfs_mount *mp, enum xfs_experimental_feat f);
+
#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 25bbcc3f4ee0..5918f433dba7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -35,6 +35,8 @@
#include "xfs_trace.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -620,6 +622,22 @@ xfs_mount_setup_inode_geom(
xfs_ialloc_setup_geometry(mp);
}
+/* Mount the metadata directory tree root. */
+STATIC int
+xfs_mount_setup_metadir(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* Load the metadata directory root inode into memory. */
+ error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR,
+ &mp->m_metadirip);
+ if (error)
+ xfs_warn(mp, "Failed to load metadir root directory, error %d",
+ error);
+ return error;
+}
+
/* Compute maximum possible height for per-AG btree types for this fs. */
static inline void
xfs_agbtree_compute_maxlevels(
@@ -817,10 +835,17 @@ xfs_mountfs(
goto out_free_dir;
}
+ error = xfs_initialize_rtgroups(mp, 0, sbp->sb_rgcount,
+ mp->m_sb.sb_rextents);
+ if (error) {
+ xfs_warn(mp, "Failed rtgroup init: %d", error);
+ goto out_free_perag;
+ }
+
if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
xfs_warn(mp, "no log defined");
error = -EFSCORRUPTED;
- goto out_free_perag;
+ goto out_free_rtgroup;
}
error = xfs_inodegc_register_shrinker(mp);
@@ -828,6 +853,13 @@ xfs_mountfs(
goto out_fail_wait;
/*
+ * If we're resuming quota status, pick up the preliminary qflags from
+ * the ondisk superblock so that we know if we should recover dquots.
+ */
+ if (xfs_is_resuming_quotaon(mp))
+ xfs_qm_resume_quotaon(mp);
+
+ /*
* Log's mount-time initialization. The first part of recovery can place
* some items on the AIL, to be handled when recovery is finished or
* cancelled.
@@ -841,6 +873,14 @@ xfs_mountfs(
}
/*
+ * If we're resuming quota status and recovered the log, re-sample the
+ * qflags from the ondisk superblock now that we've recovered it, just
+ * in case someone shut down enforcement just before a crash.
+ */
+ if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(mp->m_log))
+ xfs_qm_resume_quotaon(mp);
+
+ /*
* If logged xattrs are still enabled after log recovery finishes, then
* they'll be available until unmount. Otherwise, turn them off.
*/
@@ -866,6 +906,12 @@ xfs_mountfs(
mp->m_features |= XFS_FEAT_ATTR2;
}
+ if (xfs_has_metadir(mp)) {
+ error = xfs_mount_setup_metadir(mp);
+ if (error)
+ goto out_free_metadir;
+ }
+
/*
* Get and sanity-check the root inode.
* Save the pointer to it in the mount structure.
@@ -876,7 +922,7 @@ xfs_mountfs(
xfs_warn(mp,
"Failed to read root inode 0x%llx, error %d",
sbp->sb_rootino, -error);
- goto out_log_dealloc;
+ goto out_free_metadir;
}
ASSERT(rip != NULL);
@@ -1018,6 +1064,9 @@ xfs_mountfs(
xfs_irele(rip);
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
+ out_free_metadir:
+ if (mp->m_metadirip)
+ xfs_irele(mp->m_metadirip);
/*
* Inactivate all inodes that might still be in memory after a log
@@ -1039,7 +1088,6 @@ xfs_mountfs(
* quota inodes.
*/
xfs_unmount_flush_inodes(mp);
- out_log_dealloc:
xfs_log_mount_cancel(mp);
out_inodegc_shrinker:
shrinker_free(mp->m_inodegc_shrinker);
@@ -1047,6 +1095,8 @@ xfs_mountfs(
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
xfs_buftarg_drain(mp->m_ddev_targp);
+ out_free_rtgroup:
+ xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
out_free_perag:
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
out_free_dir:
@@ -1091,6 +1141,8 @@ xfs_unmountfs(
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
+ if (mp->m_metadirip)
+ xfs_irele(mp->m_metadirip);
xfs_unmount_flush_inodes(mp);
@@ -1129,6 +1181,7 @@ xfs_unmountfs(
xfs_errortag_clearall(mp);
#endif
shrinker_free(mp->m_inodegc_shrinker);
+ xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
@@ -1436,7 +1489,7 @@ xfs_mod_delalloc(
if (XFS_IS_REALTIME_INODE(ip)) {
percpu_counter_add_batch(&mp->m_delalloc_rtextents,
- xfs_rtb_to_rtx(mp, data_delta),
+ xfs_blen_to_rtbxlen(mp, data_delta),
XFS_DELALLOC_BATCH);
if (!ind_delta)
return;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 96496f39f551..db9dade7d22a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -72,6 +72,40 @@ struct xfs_inodegc {
};
/*
+ * Container for each type of group, used to look up individual groups and
+ * to describe their geometry.
+ */
+struct xfs_groups {
+ struct xarray xa;
+
+ /*
+ * Maximum capacity of the group in FSBs.
+ *
+ * Each group is laid out densely in the daddr space. For the
+ * degenerate case of a pre-rtgroups filesystem, the incore rtgroup
+ * pretends to have a zero-block and zero-blklog rtgroup.
+ */
+ uint32_t blocks;
+
+ /*
+ * Log(2) of the logical size of each group.
+ *
+ * Compared to the blocks field above this is rounded up to the next
+ * power of two, and thus lays out the xfs_fsblock_t/xfs_rtblock_t
+ * space sparsely with a hole from blocks to (1 << blklog) at the end
+ * of each group.
+ */
+ uint8_t blklog;
+
+ /*
+ * Mask to extract the group-relative block number from a FSB.
+ * For a pre-rtgroups filesystem we pretend to have one very large
+ * rtgroup, so this mask must be 64-bit.
+ */
+ uint64_t blkmask;
+};
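
A short illustration of why blklog and blkmask are carried alongside blocks: with each group padded out to a power-of-two region of the block number space, splitting an address into its group and group-relative parts is pure shift/mask arithmetic. The helper names below are hypothetical; only the field semantics come from the comments above.

	/* Hypothetical helpers; semantics follow the field comments above. */
	static inline uint32_t
	xg_fsb_to_gno(const struct xfs_groups *g, uint64_t fsbno)
	{
		return fsbno >> g->blklog;	/* group index */
	}

	static inline uint64_t
	xg_fsb_to_gbno(const struct xfs_groups *g, uint64_t fsbno)
	{
		return fsbno & g->blkmask;	/* block within the group */
	}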
+
+/*
* The struct xfsmount layout is optimised to separate read-mostly variables
* from variables that are frequently modified. We put the read-mostly variables
* first, then place all the other variables at the end.
@@ -85,27 +119,20 @@ typedef struct xfs_mount {
struct super_block *m_super;
struct xfs_ail *m_ail; /* fs active log item list */
struct xfs_buf *m_sb_bp; /* buffer for superblock */
+ struct xfs_buf *m_rtsb_bp; /* realtime superblock */
char *m_rtname; /* realtime device name */
char *m_logname; /* external log device name */
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
- struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
- struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
+ struct xfs_inode *m_metadirip; /* ptr to metadata directory */
+ struct xfs_inode *m_rtdirip; /* ptr to realtime metadir */
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
struct xfs_buftarg *m_ddev_targp; /* data device */
struct xfs_buftarg *m_logdev_targp;/* log device */
struct xfs_buftarg *m_rtdev_targp; /* rt device */
void __percpu *m_inodegc; /* percpu inodegc structures */
-
- /*
- * Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] > the maximum i for which
- * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i.
- * Reads and writes are serialized by the rsumip inode lock.
- */
- uint8_t *m_rsum_cache;
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
@@ -120,9 +147,11 @@ typedef struct xfs_mount {
uint8_t m_agno_log; /* log #ag's */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
int8_t m_rtxblklog; /* log2 of rextsize, if possible */
+
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
- uint m_blockwmask; /* blockwsize-1 */
+ /* number of rt extents per rt bitmap block if rtgroups enabled */
+ unsigned int m_rtx_per_rbmblock;
uint m_alloc_mxr[2]; /* max alloc btree records */
uint m_alloc_mnr[2]; /* min alloc btree records */
uint m_bmap_dmxr[2]; /* max bmap btree records */
@@ -146,7 +175,7 @@ typedef struct xfs_mount {
uint m_allocsize_blocks; /* min write size blocks */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
- uint m_rsumlevels; /* rt summary levels */
+ unsigned int m_rsumlevels; /* rt summary levels */
xfs_filblks_t m_rsumblocks; /* size of rt summary, FSBs */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_qflags; /* quota status flags */
@@ -208,7 +237,7 @@ typedef struct xfs_mount {
*/
atomic64_t m_allocbt_blks;
- struct xarray m_perags; /* per-ag accounting info */
+ struct xfs_groups m_groups[XG_TYPE_MAX];
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
@@ -224,6 +253,7 @@ typedef struct xfs_mount {
#endif
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
+ atomic_t m_rtgrotor; /* last rtgroup rtpicked */
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
@@ -298,6 +328,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
+#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -353,6 +384,19 @@ __XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
+__XFS_HAS_FEAT(metadir, METADIR)
+
+static inline bool xfs_has_rtgroups(struct xfs_mount *mp)
+{
+ /* all metadir file systems also allow rtgroups */
+ return xfs_has_metadir(mp);
+}
+
+static inline bool xfs_has_rtsb(struct xfs_mount *mp)
+{
+ /* all rtgroups filesystems with an rt section have an rtsb */
+ return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
+}
/*
* Some features are always on for v5 file systems, allow the compiler to
@@ -433,18 +477,30 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
+/* Kernel has logged a warning about pNFS being used on this fs. */
+#define XFS_OPSTATE_WARNED_PNFS 7
/* Kernel has logged a warning about online fsck being used on this fs. */
-#define XFS_OPSTATE_WARNED_SCRUB 7
+#define XFS_OPSTATE_WARNED_SCRUB 8
/* Kernel has logged a warning about shrink being used on this fs. */
-#define XFS_OPSTATE_WARNED_SHRINK 8
+#define XFS_OPSTATE_WARNED_SHRINK 9
/* Kernel has logged a warning about logged xattr updates being used. */
-#define XFS_OPSTATE_WARNED_LARP 9
+#define XFS_OPSTATE_WARNED_LARP 10
/* Mount time quotacheck is running */
-#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
+#define XFS_OPSTATE_QUOTACHECK_RUNNING 11
/* Do we want to clear log incompat flags? */
-#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 11
+#define XFS_OPSTATE_UNSET_LOG_INCOMPAT 12
/* Filesystem can use logged extended attributes */
-#define XFS_OPSTATE_USE_LARP 12
+#define XFS_OPSTATE_USE_LARP 13
+/* Kernel has logged a warning about blocksize > pagesize on this fs. */
+#define XFS_OPSTATE_WARNED_LBS 14
+/* Kernel has logged a warning about exchange-range being used on this fs. */
+#define XFS_OPSTATE_WARNED_EXCHRANGE 15
+/* Kernel has logged a warning about parent pointers being used on this fs. */
+#define XFS_OPSTATE_WARNED_PPTR 16
+/* Kernel has logged a warning about metadata dirs being used on this fs. */
+#define XFS_OPSTATE_WARNED_METADIR 17
+/* Filesystem should use qflags to determine quotaon status */
+#define XFS_OPSTATE_RESUMING_QUOTAON 18
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -469,9 +525,24 @@ __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#ifdef CONFIG_XFS_QUOTA
__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
+__XFS_IS_OPSTATE(resuming_quotaon, RESUMING_QUOTAON)
#else
-# define xfs_is_quotacheck_running(mp) (false)
-#endif
+static inline bool xfs_is_quotacheck_running(struct xfs_mount *mp)
+{
+ return false;
+}
+static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp)
+{
+ return false;
+}
+static inline void xfs_set_resuming_quotaon(struct xfs_mount *m)
+{
+}
+static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
+{
+ return false;
+}
+#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
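
The opstate accessors generated by __XFS_IS_OPSTATE are atomic bitops on mp->m_opstate, which is what lets xfs_mountfs() write "if (xfs_clear_resuming_quotaon(mp) && ...)": the clear variant is a test-and-clear that reports whether the flag had been set. Sketched expansion for the new flag, following the macro pattern visible above (the set/clear generators themselves are not shown in this hunk):

	static inline bool xfs_is_resuming_quotaon(struct xfs_mount *mp)
	{
		return test_bit(XFS_OPSTATE_RESUMING_QUOTAON, &mp->m_opstate);
	}
	static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
	{
		return test_and_clear_bit(XFS_OPSTATE_RESUMING_QUOTAON,
				&mp->m_opstate);
	}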
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 23d16186e1a3..6f4479deac6d 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -58,8 +58,7 @@ xfs_fs_get_uuid(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_notice_once(mp,
-"Using experimental pNFS feature, use at your own risk!");
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS);
if (*len < sizeof(uuid_t))
return -EINVAL;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 7e2307921deb..b928b036990b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -27,6 +27,9 @@
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_da_format.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -37,7 +40,6 @@
STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp);
STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp);
-STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi);
STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
@@ -208,6 +210,39 @@ xfs_qm_unmount(
}
}
+static void
+xfs_qm_unmount_rt(
+ struct xfs_mount *mp)
+{
+ struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, 0);
+
+ if (!rtg)
+ return;
+ if (rtg->rtg_inodes[XFS_RTGI_BITMAP])
+ xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_BITMAP]);
+ if (rtg->rtg_inodes[XFS_RTGI_SUMMARY])
+ xfs_qm_dqdetach(rtg->rtg_inodes[XFS_RTGI_SUMMARY]);
+ xfs_rtgroup_rele(rtg);
+}
+
+STATIC void
+xfs_qm_destroy_quotainos(
+ struct xfs_quotainfo *qi)
+{
+ if (qi->qi_uquotaip) {
+ xfs_irele(qi->qi_uquotaip);
+ qi->qi_uquotaip = NULL; /* paranoia */
+ }
+ if (qi->qi_gquotaip) {
+ xfs_irele(qi->qi_gquotaip);
+ qi->qi_gquotaip = NULL;
+ }
+ if (qi->qi_pquotaip) {
+ xfs_irele(qi->qi_pquotaip);
+ qi->qi_pquotaip = NULL;
+ }
+}
+
/*
* Called from the vfsops layer.
*/
@@ -221,28 +256,19 @@ xfs_qm_unmount_quotas(
*/
ASSERT(mp->m_rootip);
xfs_qm_dqdetach(mp->m_rootip);
- if (mp->m_rbmip)
- xfs_qm_dqdetach(mp->m_rbmip);
- if (mp->m_rsumip)
- xfs_qm_dqdetach(mp->m_rsumip);
+
+ /*
+ * For pre-rtgroups file systems, the RT inodes have quotas attached;
+ * detach them now.
+ */
+ if (!xfs_has_rtgroups(mp))
+ xfs_qm_unmount_rt(mp);
/*
* Release the quota inodes.
*/
- if (mp->m_quotainfo) {
- if (mp->m_quotainfo->qi_uquotaip) {
- xfs_irele(mp->m_quotainfo->qi_uquotaip);
- mp->m_quotainfo->qi_uquotaip = NULL;
- }
- if (mp->m_quotainfo->qi_gquotaip) {
- xfs_irele(mp->m_quotainfo->qi_gquotaip);
- mp->m_quotainfo->qi_gquotaip = NULL;
- }
- if (mp->m_quotainfo->qi_pquotaip) {
- xfs_irele(mp->m_quotainfo->qi_pquotaip);
- mp->m_quotainfo->qi_pquotaip = NULL;
- }
- }
+ if (mp->m_quotainfo)
+ xfs_qm_destroy_quotainos(mp->m_quotainfo);
}
STATIC int
@@ -302,6 +328,8 @@ xfs_qm_need_dqattach(
return false;
if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return false;
+ if (xfs_is_metadir_inode(ip))
+ return false;
return true;
}
@@ -324,6 +352,7 @@ xfs_qm_dqattach_locked(
return 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ ASSERT(!xfs_is_metadir_inode(ip));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -616,6 +645,157 @@ xfs_qm_init_timelimits(
xfs_qm_dqdestroy(dqp);
}
+static int
+xfs_qm_load_metadir_qinos(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct xfs_inode **dpp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ error = xfs_dqinode_load_parent(tp, dpp);
+ if (error == -ENOENT) {
+ /* no quota dir directory, but we'll create one later */
+ error = 0;
+ goto out_trans;
+ }
+ if (error)
+ goto out_trans;
+
+ if (XFS_IS_UQUOTA_ON(mp)) {
+ error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_USER,
+ &qi->qi_uquotaip);
+ if (error && error != -ENOENT)
+ goto out_trans;
+ }
+
+ if (XFS_IS_GQUOTA_ON(mp)) {
+ error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_GROUP,
+ &qi->qi_gquotaip);
+ if (error && error != -ENOENT)
+ goto out_trans;
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ error = xfs_dqinode_load(tp, *dpp, XFS_DQTYPE_PROJ,
+ &qi->qi_pquotaip);
+ if (error && error != -ENOENT)
+ goto out_trans;
+ }
+
+ error = 0;
+out_trans:
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/* Create quota inodes in the metadata directory tree. */
+STATIC int
+xfs_qm_create_metadir_qinos(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct xfs_inode **dpp)
+{
+ int error;
+
+ if (!*dpp) {
+ error = xfs_dqinode_mkdir_parent(mp, dpp);
+ if (error && error != -EEXIST)
+ return error;
+ }
+
+ if (XFS_IS_UQUOTA_ON(mp) && !qi->qi_uquotaip) {
+ error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_USER,
+ &qi->qi_uquotaip);
+ if (error)
+ return error;
+ }
+
+ if (XFS_IS_GQUOTA_ON(mp) && !qi->qi_gquotaip) {
+ error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_GROUP,
+ &qi->qi_gquotaip);
+ if (error)
+ return error;
+ }
+
+ if (XFS_IS_PQUOTA_ON(mp) && !qi->qi_pquotaip) {
+ error = xfs_dqinode_metadir_create(*dpp, XFS_DQTYPE_PROJ,
+ &qi->qi_pquotaip);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Add QUOTABIT to sb_versionnum and initialize qflags in preparation for
+ * creating quota files on a metadir filesystem.
+ */
+STATIC int
+xfs_qm_prep_metadir_sb(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+
+ xfs_add_quota(mp);
+
+ /* qflags will get updated fully _after_ quotacheck */
+ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
+
+ spin_unlock(&mp->m_sb_lock);
+ xfs_log_sb(tp);
+
+ return xfs_trans_commit(tp);
+}
+
+/*
+ * Load existing quota inodes or create them. Since this is a V5 filesystem,
+ * we don't have to deal with the grp/prjquota switcheroo thing from V4.
+ */
+STATIC int
+xfs_qm_init_metadir_qinos(
+ struct xfs_mount *mp)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_inode *dp = NULL;
+ int error;
+
+ if (!xfs_has_quota(mp)) {
+ error = xfs_qm_prep_metadir_sb(mp);
+ if (error)
+ return error;
+ }
+
+ error = xfs_qm_load_metadir_qinos(mp, qi, &dp);
+ if (error)
+ goto out_err;
+
+ error = xfs_qm_create_metadir_qinos(mp, qi, &dp);
+ if (error)
+ goto out_err;
+
+ xfs_irele(dp);
+ return 0;
+out_err:
+ xfs_qm_destroy_quotainos(mp->m_quotainfo);
+ if (dp)
+ xfs_irele(dp);
+ return error;
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -640,7 +820,10 @@ xfs_qm_init_quotainfo(
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
- error = xfs_qm_init_quotainos(mp);
+ if (xfs_has_metadir(mp))
+ error = xfs_qm_init_metadir_qinos(mp);
+ else
+ error = xfs_qm_init_quotainos(mp);
if (error)
goto out_free_lru;
@@ -733,6 +916,17 @@ xfs_qm_destroy_quotainfo(
mp->m_quotainfo = NULL;
}
+static inline enum xfs_metafile_type
+xfs_qm_metafile_type(
+ unsigned int flags)
+{
+ if (flags & XFS_QMOPT_UQUOTA)
+ return XFS_METAFILE_USRQUOTA;
+ else if (flags & XFS_QMOPT_GQUOTA)
+ return XFS_METAFILE_GRPQUOTA;
+ return XFS_METAFILE_PRJQUOTA;
+}
+
/*
* Create an inode and return with a reference already taken, but unlocked
* This is how we create quota inodes
@@ -744,6 +938,7 @@ xfs_qm_qino_alloc(
unsigned int flags)
{
struct xfs_trans *tp;
+ enum xfs_metafile_type metafile_type = xfs_qm_metafile_type(flags);
int error;
bool need_alloc = true;
@@ -777,9 +972,10 @@ xfs_qm_qino_alloc(
}
}
if (ino != NULLFSINO) {
- error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ error = xfs_metafile_iget(mp, ino, metafile_type, ipp);
if (error)
return error;
+
mp->m_sb.sb_gquotino = NULLFSINO;
mp->m_sb.sb_pquotino = NULLFSINO;
need_alloc = false;
@@ -806,6 +1002,8 @@ xfs_qm_qino_alloc(
xfs_trans_cancel(tp);
return error;
}
+ if (xfs_has_metadir(mp))
+ xfs_metafile_set_iflag(tp, *ipp, metafile_type);
}
/*
@@ -1153,8 +1351,8 @@ xfs_qm_dqusage_adjust(
void *data)
{
struct xfs_inode *ip;
- xfs_qcnt_t nblks;
- xfs_filblks_t rtblks = 0; /* total rt blks */
+ xfs_filblks_t nblks, rtblks;
+ unsigned int lock_mode;
int error;
ASSERT(XFS_IS_QUOTA_ON(mp));
@@ -1189,20 +1387,23 @@ xfs_qm_dqusage_adjust(
}
}
+ /* Metadata directory files are not accounted to user-visible quotas. */
+ if (xfs_is_metadir_inode(ip))
+ goto error0;
+
ASSERT(ip->i_delayed_blks == 0);
+ lock_mode = xfs_ilock_data_map_shared(ip);
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
-
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
- if (error)
+ if (error) {
+ xfs_iunlock(ip, lock_mode);
goto error0;
-
- xfs_bmap_count_leaves(ifp, &rtblks);
+ }
}
-
- nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+ xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
+ xfs_iunlock(ip, lock_mode);
/*
* Add the (disk blocks and inode) resources occupied by this
@@ -1462,10 +1663,11 @@ xfs_qm_mount_quotas(
uint sbf;
/*
- * If quotas on realtime volumes is not supported, we disable
- * quotas immediately.
+ * If quotas on realtime volumes is not supported, disable quotas
+ * immediately. We only support rtquota if rtgroups are enabled to
+ * avoid problems with older kernels.
*/
- if (mp->m_sb.sb_rextents) {
+ if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;
@@ -1533,7 +1735,7 @@ xfs_qm_mount_quotas(
}
if (error) {
- xfs_warn(mp, "Failed to initialize disk quotas.");
+ xfs_warn(mp, "Failed to initialize disk quotas, err %d.", error);
return;
}
}
@@ -1552,27 +1754,26 @@ xfs_qm_qino_load(
xfs_dqtype_t type,
struct xfs_inode **ipp)
{
- xfs_ino_t ino = NULLFSINO;
-
- switch (type) {
- case XFS_DQTYPE_USER:
- ino = mp->m_sb.sb_uquotino;
- break;
- case XFS_DQTYPE_GROUP:
- ino = mp->m_sb.sb_gquotino;
- break;
- case XFS_DQTYPE_PROJ:
- ino = mp->m_sb.sb_pquotino;
- break;
- default:
- ASSERT(0);
- return -EFSCORRUPTED;
- }
-
- if (ino == NULLFSINO)
- return -ENOENT;
-
- return xfs_iget(mp, NULL, ino, 0, 0, ipp);
+ struct xfs_trans *tp;
+ struct xfs_inode *dp = NULL;
+ int error;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+
+ if (xfs_has_metadir(mp)) {
+ error = xfs_dqinode_load_parent(tp, &dp);
+ if (error)
+ goto out_cancel;
+ }
+
+ error = xfs_dqinode_load(tp, dp, type, ipp);
+ if (dp)
+ xfs_irele(dp);
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
}
/*
@@ -1666,24 +1867,6 @@ error_rele:
}
STATIC void
-xfs_qm_destroy_quotainos(
- struct xfs_quotainfo *qi)
-{
- if (qi->qi_uquotaip) {
- xfs_irele(qi->qi_uquotaip);
- qi->qi_uquotaip = NULL; /* paranoia */
- }
- if (qi->qi_gquotaip) {
- xfs_irele(qi->qi_gquotaip);
- qi->qi_gquotaip = NULL;
- }
- if (qi->qi_pquotaip) {
- xfs_irele(qi->qi_pquotaip);
- qi->qi_pquotaip = NULL;
- }
-}
-
-STATIC void
xfs_qm_dqfree_one(
struct xfs_dquot *dqp)
{
@@ -1735,6 +1918,8 @@ xfs_qm_vop_dqalloc(
if (!XFS_IS_QUOTA_ON(mp))
return 0;
+ ASSERT(!xfs_is_metadir_inode(ip));
+
lockflags = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockflags);
@@ -1858,23 +2043,29 @@ xfs_qm_vop_chown(
struct xfs_dquot *newdq)
{
struct xfs_dquot *prevdq;
- uint bfield = XFS_IS_REALTIME_INODE(ip) ?
- XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
-
+ xfs_filblks_t dblocks, rblocks;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
+ ASSERT(!xfs_is_metadir_inode(ip));
/* old dquot */
prevdq = *IO_olddq;
ASSERT(prevdq);
ASSERT(prevdq != newdq);
- xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+ xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
+
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_BCOUNT,
+ -(xfs_qcnt_t)dblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_RTBCOUNT,
+ -(xfs_qcnt_t)rblocks);
xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
/* the sparkling new dquot */
- xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_BCOUNT, dblocks);
+ xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_RTBCOUNT, rblocks);
xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
@@ -1884,7 +2075,8 @@ xfs_qm_vop_chown(
* (having already bumped up the real counter) so that we don't have
* any reservation to give back when we commit.
*/
- xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ xfs_trans_mod_dquot(tp, newdq,
+ isrt ? XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS,
-ip->i_delayed_blks);
/*
@@ -1896,8 +2088,13 @@ xfs_qm_vop_chown(
*/
tp->t_flags |= XFS_TRANS_DIRTY;
xfs_dqlock(prevdq);
- ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
- prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ if (isrt) {
+ ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks);
+ prevdq->q_rtb.reserved -= ip->i_delayed_blks;
+ } else {
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ }
xfs_dqunlock(prevdq);
/*
@@ -1951,6 +2148,7 @@ xfs_qm_vop_create_dqattach(
return;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+ ASSERT(!xfs_is_metadir_inode(ip));
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
@@ -1981,6 +2179,8 @@ xfs_inode_near_dquot_enforcement(
xfs_dqtype_t type)
{
struct xfs_dquot *dqp;
+ struct xfs_dquot_res *res;
+ struct xfs_dquot_pre *pre;
int64_t freesp;
/* We only care for quotas that are enabled and enforced. */
@@ -1989,21 +2189,30 @@ xfs_inode_near_dquot_enforcement(
return false;
if (xfs_dquot_res_over_limits(&dqp->q_ino) ||
+ xfs_dquot_res_over_limits(&dqp->q_blk) ||
xfs_dquot_res_over_limits(&dqp->q_rtb))
return true;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ res = &dqp->q_rtb;
+ pre = &dqp->q_rtb_prealloc;
+ } else {
+ res = &dqp->q_blk;
+ pre = &dqp->q_blk_prealloc;
+ }
+
/* For space on the data device, check the various thresholds. */
- if (!dqp->q_prealloc_hi_wmark)
+ if (!pre->q_prealloc_hi_wmark)
return false;
- if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark)
+ if (res->reserved < pre->q_prealloc_lo_wmark)
return false;
- if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark)
+ if (res->reserved >= pre->q_prealloc_hi_wmark)
return true;
- freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved;
- if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT])
+ freesp = pre->q_prealloc_hi_wmark - res->reserved;
+ if (freesp < pre->q_low_space[XFS_QLOWSP_5_PCNT])
return true;
return false;
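
A worked example of the reworked threshold logic, with made-up numbers: take q_prealloc_lo_wmark = 800, q_prealloc_hi_wmark = 1000 and q_low_space[XFS_QLOWSP_5_PCNT] = 50 blocks for whichever of q_blk_prealloc/q_rtb_prealloc applies. At res->reserved = 960 the dquot clears the low watermark and sits below the high watermark, but freesp = 1000 - 960 = 40 < 50, so the inode is reported as near enforcement; at reserved = 850, freesp = 150 and the function returns false.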
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index a11436579877..847ba29630e9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -19,18 +19,24 @@
STATIC void
xfs_fill_statvfs_from_dquot(
struct kstatfs *statp,
+ struct xfs_inode *ip,
struct xfs_dquot *dqp)
{
+ struct xfs_dquot_res *blkres = &dqp->q_blk;
uint64_t limit;
- limit = dqp->q_blk.softlimit ?
- dqp->q_blk.softlimit :
- dqp->q_blk.hardlimit;
+ if (XFS_IS_REALTIME_MOUNT(ip->i_mount) &&
+ (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME)))
+ blkres = &dqp->q_rtb;
+
+ limit = blkres->softlimit ?
+ blkres->softlimit :
+ blkres->hardlimit;
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > dqp->q_blk.reserved) ?
- (statp->f_blocks - dqp->q_blk.reserved) : 0;
+ (statp->f_blocks > blkres->reserved) ?
+ (statp->f_blocks - blkres->reserved) : 0;
}
limit = dqp->q_ino.softlimit ?
@@ -61,7 +67,7 @@ xfs_qm_statvfs(
struct xfs_dquot *dqp;
if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
- xfs_fill_statvfs_from_dquot(statp, dqp);
+ xfs_fill_statvfs_from_dquot(statp, ip, dqp);
xfs_qm_dqput(dqp);
}
}
@@ -135,3 +141,21 @@ xfs_qm_newmount(
return 0;
}
+
+/*
+ * If the sysadmin didn't provide any quota mount options, restore the quota
+ * accounting and enforcement state from the ondisk superblock. Only do this
+ * for metadir filesystems because this is a behavior change.
+ */
+void
+xfs_qm_resume_quotaon(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_metadir(mp))
+ return;
+ if (xfs_has_norecovery(mp))
+ return;
+
+ mp->m_qflags = mp->m_sb.sb_qflags & (XFS_ALL_QUOTA_ACCT |
+ XFS_ALL_QUOTA_ENFD);
+}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 23d71a55bbc0..fa1317cc396c 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -29,6 +29,11 @@ struct xfs_buf;
(XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
(XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
+#define XFS_IS_DQDETACHED(ip) \
+ ((ip)->i_udquot == NULL && \
+ (ip)->i_gdquot == NULL && \
+ (ip)->i_pdquot == NULL)
+
#define XFS_QM_NEED_QUOTACHECK(mp) \
((XFS_IS_UQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
@@ -120,10 +125,12 @@ extern void xfs_qm_dqdetach(struct xfs_inode *);
extern void xfs_qm_dqrele(struct xfs_dquot *);
extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+void xfs_qm_resume_quotaon(struct xfs_mount *mp);
extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks);
# ifdef CONFIG_XFS_LIVE_HOOKS
void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -197,11 +204,17 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#define xfs_qm_dqrele(d) do { (d) = (d); } while(0)
#define xfs_qm_statvfs(ip, s) do { } while(0)
#define xfs_qm_newmount(mp, a, b) (0)
+#define xfs_qm_resume_quotaon(mp) ((void)0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#define xfs_inode_near_dquot_enforcement(ip, type) (false)
+static inline int xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return 0;
+}
+
# ifdef CONFIG_XFS_LIVE_HOOKS
# define xfs_dqtrx_hook_enable() ((void)0)
# define xfs_dqtrx_hook_disable() ((void)0)
@@ -209,12 +222,6 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
#endif /* CONFIG_XFS_QUOTA */
-static inline int
-xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
-{
- return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
-}
-
static inline void
xfs_quota_unreserve_blkres(struct xfs_inode *ip, uint64_t blocks)
{
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 27398512b179..bede1c96c330 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -244,7 +244,7 @@ xfs_refcount_update_diff_items(
struct xfs_refcount_intent *ra = ci_entry(a);
struct xfs_refcount_intent *rb = ci_entry(b);
- return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
+ return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
}
/* Log refcount updates in the intent item. */
@@ -330,7 +330,7 @@ xfs_refcount_defer_add(
trace_xfs_refcount_defer(mp, ri);
- ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_startblock);
+ ri->ri_group = xfs_group_intent_get(mp, ri->ri_startblock, XG_TYPE_AG);
xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
}
@@ -341,7 +341,7 @@ xfs_refcount_update_cancel_item(
{
struct xfs_refcount_intent *ri = ci_entry(item);
- xfs_perag_intent_put(ri->ri_pag);
+ xfs_group_intent_put(ri->ri_group);
kmem_cache_free(xfs_refcount_intent_cache, ri);
}
@@ -431,7 +431,8 @@ xfs_cui_recover_work(
ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
ri->ri_startblock = pmap->pe_startblock;
ri->ri_blockcount = pmap->pe_len;
- ri->ri_pag = xfs_perag_intent_get(mp, pmap->pe_startblock);
+ ri->ri_group = xfs_group_intent_get(mp, pmap->pe_startblock,
+ XG_TYPE_AG);
xfs_defer_add_item(dfp, &ri->ri_list);
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 5bf6682e701b..b11769c009ef 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -144,7 +144,7 @@ xfs_reflink_find_shared(
if (error)
return error;
- cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
+ cur = xfs_refcountbt_init_cursor(pag_mount(pag), tp, agbp, pag);
error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
find_end_of_shared);
@@ -894,14 +894,13 @@ int
xfs_reflink_recover_cow(
struct xfs_mount *mp)
{
- struct xfs_perag *pag;
- xfs_agnumber_t agno;
+ struct xfs_perag *pag = NULL;
int error = 0;
if (!xfs_has_reflink(mp))
return 0;
- for_each_perag(mp, agno, pag) {
+ while ((pag = xfs_perag_next(mp, pag))) {
error = xfs_refcount_recover_cow_leftovers(mp, pag);
if (error) {
xfs_perag_rele(pag);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 88b5580e1e19..76b3c0ed3b4f 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -243,7 +243,7 @@ xfs_rmap_update_diff_items(
struct xfs_rmap_intent *ra = ri_entry(a);
struct xfs_rmap_intent *rb = ri_entry(b);
- return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno;
+ return ra->ri_group->xg_gno - rb->ri_group->xg_gno;
}
/* Log rmap updates in the intent item. */
@@ -353,7 +353,8 @@ xfs_rmap_defer_add(
trace_xfs_rmap_defer(mp, ri);
- ri->ri_pag = xfs_perag_intent_get(mp, ri->ri_bmap.br_startblock);
+ ri->ri_group = xfs_group_intent_get(mp, ri->ri_bmap.br_startblock,
+ XG_TYPE_AG);
xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
@@ -364,7 +365,7 @@ xfs_rmap_update_cancel_item(
{
struct xfs_rmap_intent *ri = ri_entry(item);
- xfs_perag_intent_put(ri->ri_pag);
+ xfs_group_intent_put(ri->ri_group);
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
@@ -494,7 +495,7 @@ xfs_rui_recover_work(
ri->ri_bmap.br_blockcount = map->me_len;
ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- ri->ri_pag = xfs_perag_intent_get(mp, map->me_startblock);
+ ri->ri_group = xfs_group_intent_get(mp, map->me_startblock, XG_TYPE_AG);
xfs_defer_add_item(dfp, &ri->ri_list);
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 3a2005a1e673..0cb534d71119 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -25,6 +25,11 @@
#include "xfs_quota.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_da_format.h"
+#include "xfs_metafile.h"
+#include "xfs_rtgroup.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
/*
* Return whether there are any free extents in the size range given
@@ -38,14 +43,14 @@ xfs_rtany_summary(
xfs_fileoff_t bbno, /* bitmap block number */
int *maxlog) /* out: max log2 extent size free */
{
- struct xfs_mount *mp = args->mp;
+ uint8_t *rsum_cache = args->rtg->rtg_rsum_cache;
int error;
int log; /* loop counter, log2 of ext. size */
xfs_suminfo_t sum; /* summary data */
- /* There are no extents at levels >= m_rsum_cache[bbno]. */
- if (mp->m_rsum_cache) {
- high = min(high, mp->m_rsum_cache[bbno] - 1);
+ /* There are no extents at levels >= rsum_cache[bbno]. */
+ if (rsum_cache) {
+ high = min(high, rsum_cache[bbno] - 1);
if (low > high) {
*maxlog = -1;
return 0;
@@ -77,12 +82,11 @@ xfs_rtany_summary(
*maxlog = -1;
out:
/* There were no extents at levels > log. */
- if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log + 1;
+ if (rsum_cache && log + 1 < rsum_cache[bbno])
+ rsum_cache[bbno] = log + 1;
return 0;
}
-
/*
* Copy and transform the summary file, given the old and new
* parameters in the mount structures.
@@ -149,7 +153,7 @@ xfs_rtallocate_range(
/*
* Find the next allocated block (end of free extent).
*/
- error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
+ error = xfs_rtfind_forw(args, end, args->rtg->rtg_extents - 1,
&postblock);
if (error)
return error;
@@ -211,14 +215,14 @@ xfs_rtalloc_align_len(
*/
static inline xfs_rtxlen_t
xfs_rtallocate_clamp_len(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
xfs_rtxnum_t startrtx,
xfs_rtxlen_t rtxlen,
xfs_rtxlen_t prod)
{
xfs_rtxlen_t ret;
- ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx;
+ ret = min(rtg->rtg_extents, startrtx + rtxlen) - startrtx;
return xfs_rtalloc_align_len(ret, prod);
}
@@ -253,10 +257,11 @@ xfs_rtallocate_extent_block(
* Loop over all the extents starting in this bitmap block up to the
* end of the rt volume, looking for one that's long enough.
*/
- end = min(mp->m_sb.sb_rextents, xfs_rbmblock_to_rtx(mp, bbno + 1)) - 1;
+ end = min(args->rtg->rtg_extents, xfs_rbmblock_to_rtx(mp, bbno + 1)) -
+ 1;
for (i = xfs_rbmblock_to_rtx(mp, bbno); i <= end; i++) {
/* Make sure we don't scan off the end of the rt volume. */
- scanlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod);
+ scanlen = xfs_rtallocate_clamp_len(args->rtg, i, maxlen, prod);
if (scanlen < minlen)
break;
@@ -341,7 +346,6 @@ xfs_rtallocate_extent_exact(
xfs_rtxlen_t prod, /* extent product factor */
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- struct xfs_mount *mp = args->mp;
xfs_rtxnum_t next; /* next rtext to try (dummy) */
xfs_rtxlen_t alloclen; /* candidate length */
xfs_rtxlen_t scanlen; /* number of free rtx to look for */
@@ -352,7 +356,7 @@ xfs_rtallocate_extent_exact(
ASSERT(maxlen % prod == 0);
/* Make sure we don't run off the end of the rt volume. */
- scanlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
+ scanlen = xfs_rtallocate_clamp_len(args->rtg, start, maxlen, prod);
if (scanlen < minlen)
return -ENOSPC;
@@ -413,11 +417,10 @@ xfs_rtallocate_extent_near(
ASSERT(maxlen % prod == 0);
/*
- * If the block number given is off the end, silently set it to
- * the last block.
+ * If the block number given is off the end, silently set it to the last
+ * block.
*/
- if (start >= mp->m_sb.sb_rextents)
- start = mp->m_sb.sb_rextents - 1;
+ start = min(start, args->rtg->rtg_extents - 1);
/*
* Try the exact allocation first.
@@ -649,19 +652,30 @@ xfs_rtallocate_extent_size(
return -ENOSPC;
}
+static void
+xfs_rtunmount_rtg(
+ struct xfs_rtgroup *rtg)
+{
+ int i;
+
+ for (i = 0; i < XFS_RTGI_MAX; i++)
+ xfs_rtginode_irele(&rtg->rtg_inodes[i]);
+ kvfree(rtg->rtg_rsum_cache);
+}
+
static int
xfs_alloc_rsum_cache(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
xfs_extlen_t rbmblocks)
{
/*
* The rsum cache is initialized to the maximum value, which is
* trivially an upper bound on the maximum level with any free extents.
*/
- mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
- if (!mp->m_rsum_cache)
+ rtg->rtg_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL);
+ if (!rtg->rtg_rsum_cache)
return -ENOMEM;
- memset(mp->m_rsum_cache, -1, rbmblocks);
+ memset(rtg->rtg_rsum_cache, -1, rbmblocks);
return 0;
}
@@ -698,44 +712,175 @@ out_iolock:
return error;
}
+/* Ensure that the rtgroup metadata inode is loaded, creating it if needed. */
+static int
+xfs_rtginode_ensure(
+ struct xfs_rtgroup *rtg,
+ enum xfs_rtg_inodes type)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ if (rtg->rtg_inodes[type])
+ return 0;
+
+ error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp);
+ if (error)
+ return error;
+ error = xfs_rtginode_load(rtg, type, tp);
+ xfs_trans_cancel(tp);
+
+ if (error != -ENOENT)
+ return error;
+ return xfs_rtginode_create(rtg, type, true);
+}
+
+static struct xfs_mount *
+xfs_growfs_rt_alloc_fake_mount(
+ const struct xfs_mount *mp,
+ xfs_rfsblock_t rblocks,
+ xfs_agblock_t rextsize)
+{
+ struct xfs_mount *nmp;
+
+ nmp = kmemdup(mp, sizeof(*mp), GFP_KERNEL);
+ if (!nmp)
+ return NULL;
+ xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb, rextsize);
+ nmp->m_sb.sb_rblocks = rblocks;
+ nmp->m_sb.sb_rextents = xfs_blen_to_rtbxlen(nmp, nmp->m_sb.sb_rblocks);
+ nmp->m_sb.sb_rbmblocks = xfs_rtbitmap_blockcount(nmp);
+ nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents);
+ if (xfs_has_rtgroups(nmp))
+ nmp->m_sb.sb_rgcount = howmany_64(nmp->m_sb.sb_rextents,
+ nmp->m_sb.sb_rgextents);
+ else
+ nmp->m_sb.sb_rgcount = 1;
+ nmp->m_rsumblocks = xfs_rtsummary_blockcount(nmp, &nmp->m_rsumlevels);
+
+ if (rblocks > 0)
+ nmp->m_features |= XFS_FEAT_REALTIME;
+
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+ return nmp;
+}
+
+/* Free all the new space and return the number of extents actually freed. */
+static int
+xfs_growfs_rt_free_new(
+ struct xfs_rtgroup *rtg,
+ struct xfs_rtalloc_args *nargs,
+ xfs_rtbxlen_t *freed_rtx)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rgnumber_t rgno = rtg_rgno(rtg);
+ xfs_rtxnum_t start_rtx = 0, end_rtx;
+
+ if (rgno < mp->m_sb.sb_rgcount)
+ start_rtx = xfs_rtgroup_extents(mp, rgno);
+ end_rtx = xfs_rtgroup_extents(nargs->mp, rgno);
+
+ /*
+ * Compute the first new extent that we want to free, being careful to
+ * skip past a realtime superblock at the start of the realtime volume.
+ */
+ if (xfs_has_rtsb(nargs->mp) && rgno == 0 && start_rtx == 0)
+ start_rtx++;
+ *freed_rtx = end_rtx - start_rtx;
+ return xfs_rtfree_range(nargs, start_rtx, *freed_rtx);
+}
+
+static xfs_rfsblock_t
+xfs_growfs_rt_nrblocks(
+ struct xfs_rtgroup *rtg,
+ xfs_rfsblock_t nrblocks,
+ xfs_agblock_t rextsize,
+ xfs_fileoff_t bmbno)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rfsblock_t step;
+
+ step = (bmbno + 1) * mp->m_rtx_per_rbmblock * rextsize;
+ if (xfs_has_rtgroups(mp)) {
+ xfs_rfsblock_t rgblocks = mp->m_sb.sb_rgextents * rextsize;
+
+ step = min(rgblocks, step) + rgblocks * rtg_rgno(rtg);
+ }
+
+ return min(nrblocks, step);
+}
+
+/*
+ * If the post-grow filesystem will have an rtsb, we're initializing the first
+ * rtgroup, and the filesystem didn't previously have a realtime section,
+ * write the rtsb now and attach the rtsb buffer to the real mount.
+ */
+static int
+xfs_growfs_rt_init_rtsb(
+ const struct xfs_rtalloc_args *nargs,
+ const struct xfs_rtgroup *rtg,
+ const struct xfs_rtalloc_args *args)
+{
+ struct xfs_mount *mp = args->mp;
+ struct xfs_buf *rtsb_bp;
+ int error;
+
+ if (!xfs_has_rtsb(nargs->mp))
+ return 0;
+ if (rtg_rgno(rtg) > 0)
+ return 0;
+ if (mp->m_sb.sb_rblocks)
+ return 0;
+
+ error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
+ 0, &rtsb_bp);
+ if (error)
+ return error;
+
+ rtsb_bp->b_maps[0].bm_bn = XFS_RTSB_DADDR;
+ rtsb_bp->b_ops = &xfs_rtsb_buf_ops;
+
+ xfs_update_rtsb(rtsb_bp, mp->m_sb_bp);
+ mp->m_rtsb_bp = rtsb_bp;
+ error = xfs_bwrite(rtsb_bp);
+ xfs_buf_unlock(rtsb_bp);
+ return error;
+}
+
static int
xfs_growfs_rt_bmblock(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
xfs_rfsblock_t nrblocks,
xfs_agblock_t rextsize,
xfs_fileoff_t bmbno)
{
- struct xfs_inode *rbmip = mp->m_rbmip;
- struct xfs_inode *rsumip = mp->m_rsumip;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
struct xfs_rtalloc_args args = {
.mp = mp,
+ .rtg = rtg,
};
struct xfs_rtalloc_args nargs = {
+ .rtg = rtg,
};
struct xfs_mount *nmp;
- xfs_rfsblock_t nrblocks_step;
xfs_rtbxlen_t freed_rtx;
int error;
-
- nrblocks_step = (bmbno + 1) * NBBY * mp->m_sb.sb_blocksize * rextsize;
-
- nmp = nargs.mp = kmemdup(mp, sizeof(*mp), GFP_KERNEL);
+ /*
+ * Calculate new sb and mount fields for this round. Also ensure the
+ * rtg_extents value is uptodate as the rtbitmap code relies on it.
+ */
+ nmp = nargs.mp = xfs_growfs_rt_alloc_fake_mount(mp,
+ xfs_growfs_rt_nrblocks(rtg, nrblocks, rextsize, bmbno),
+ rextsize);
if (!nmp)
return -ENOMEM;
- /*
- * Calculate new sb and mount fields for this round.
- */
- nmp->m_sb.sb_rextsize = rextsize;
- xfs_mount_sb_set_rextsize(nmp, &nmp->m_sb);
- nmp->m_sb.sb_rbmblocks = bmbno + 1;
- nmp->m_sb.sb_rblocks = min(nrblocks, nrblocks_step);
- nmp->m_sb.sb_rextents = xfs_rtb_to_rtx(nmp, nmp->m_sb.sb_rblocks);
- nmp->m_sb.sb_rextslog = xfs_compute_rextslog(nmp->m_sb.sb_rextents);
- nmp->m_rsumlevels = nmp->m_sb.sb_rextslog + 1;
- nmp->m_rsumblocks = xfs_rtsummary_blockcount(mp, nmp->m_rsumlevels,
- nmp->m_sb.sb_rbmblocks);
+ xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
+ nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
/*
* Recompute the growfsrt reservation from the new rsumsize, so that the
@@ -748,8 +893,8 @@ xfs_growfs_rt_bmblock(
goto out_free;
nargs.tp = args.tp;
- xfs_rtbitmap_lock(mp);
- xfs_rtbitmap_trans_join(args.tp);
+ xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
+ xfs_rtgroup_trans_join(args.tp, args.rtg, XFS_RTGLOCK_BITMAP);
/*
* Update the bitmap inode's size ondisk and incore. We need to update
@@ -780,6 +925,10 @@ xfs_growfs_rt_bmblock(
goto out_cancel;
}
+ error = xfs_growfs_rt_init_rtsb(&nargs, rtg, &args);
+ if (error)
+ goto out_cancel;
+
/*
* Update superblock fields.
*/
@@ -798,12 +947,14 @@ xfs_growfs_rt_bmblock(
if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
+ if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
+ xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
+ nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
/*
* Free the new extent.
*/
- freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
- error = xfs_rtfree_range(&nargs, mp->m_sb.sb_rextents, freed_rtx);
+ error = xfs_growfs_rt_free_new(rtg, &nargs, &freed_rtx);
xfs_rtbuf_cache_relse(&nargs);
if (error)
goto out_cancel;
@@ -818,7 +969,6 @@ xfs_growfs_rt_bmblock(
*/
mp->m_rsumlevels = nmp->m_rsumlevels;
mp->m_rsumblocks = nmp->m_rsumblocks;
- xfs_mount_sb_set_rextsize(mp, &mp->m_sb);
/*
* Recompute the growfsrt reservation from the new rsumsize.
@@ -844,6 +994,15 @@ out_free:
return error;
}
+static xfs_rtxnum_t
+xfs_last_rtgroup_extents(
+ struct xfs_mount *mp)
+{
+ return mp->m_sb.sb_rextents -
+ ((xfs_rtxnum_t)(mp->m_sb.sb_rgcount - 1) *
+ mp->m_sb.sb_rgextents);
+}
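
Worked example for the helper above, with illustrative numbers only: for sb_rextents = 1,000,000, sb_rgextents = 400,000 and sb_rgcount = 3, the last rtgroup holds 1,000,000 - 2 * 400,000 = 200,000 extents, i.e. whatever is left over after the full-size groups.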
+
/*
* Calculate the last rbmblock currently used.
*
@@ -851,34 +1010,235 @@ out_free:
*/
static xfs_fileoff_t
xfs_last_rt_bmblock(
- struct xfs_mount *mp)
+ struct xfs_rtgroup *rtg)
{
- xfs_fileoff_t bmbno = mp->m_sb.sb_rbmblocks;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rgnumber_t rgno = rtg_rgno(rtg);
+ xfs_fileoff_t bmbno = 0;
+
+ ASSERT(!mp->m_sb.sb_rgcount || rgno >= mp->m_sb.sb_rgcount - 1);
+
+ if (mp->m_sb.sb_rgcount && rgno == mp->m_sb.sb_rgcount - 1) {
+ xfs_rtxnum_t nrext = xfs_last_rtgroup_extents(mp);
+
+ /* Also fill up the previous block if not entirely full. */
+ bmbno = xfs_rtbitmap_blockcount_len(mp, nrext);
+ if (xfs_rtx_to_rbmword(mp, nrext) != 0)
+ bmbno--;
+ }
- /* Skip the current block if it is exactly full. */
- if (xfs_rtx_to_rbmword(mp, mp->m_sb.sb_rextents) != 0)
- bmbno--;
return bmbno;
}
/*
+ * Allocate space to the bitmap and summary files, as necessary.
+ */
+static int
+xfs_growfs_rt_alloc_blocks(
+ struct xfs_rtgroup *rtg,
+ xfs_rfsblock_t nrblocks,
+ xfs_agblock_t rextsize,
+ xfs_extlen_t *nrbmblocks)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ struct xfs_inode *rsumip = rtg->rtg_inodes[XFS_RTGI_SUMMARY];
+ xfs_extlen_t orbmblocks = 0;
+ xfs_extlen_t orsumblocks = 0;
+ struct xfs_mount *nmp;
+ int error = 0;
+
+ nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, rextsize);
+ if (!nmp)
+ return -ENOMEM;
+ *nrbmblocks = nmp->m_sb.sb_rbmblocks;
+
+ if (xfs_has_rtgroups(mp)) {
+ /*
+ * For file systems with the rtgroups feature, the RT bitmap and
+ * summary are always fully allocated, which means that we never
+ * need to grow the existing files.
+ *
+ * But we have to be careful to only fill the bitmap until the
+ * end of the actually used range.
+ */
+ if (rtg_rgno(rtg) == nmp->m_sb.sb_rgcount - 1)
+ *nrbmblocks = xfs_rtbitmap_blockcount_len(nmp,
+ xfs_last_rtgroup_extents(nmp));
+
+ if (mp->m_sb.sb_rgcount &&
+ rtg_rgno(rtg) == mp->m_sb.sb_rgcount - 1)
+ goto out_free;
+ } else {
+ /*
+ * Get the old block counts for bitmap and summary inodes.
+ * These can't change since other growfs callers are locked out.
+ */
+ orbmblocks = XFS_B_TO_FSB(mp, rbmip->i_disk_size);
+ orsumblocks = XFS_B_TO_FSB(mp, rsumip->i_disk_size);
+ }
+
+ error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_BITMAP, orbmblocks,
+ nmp->m_sb.sb_rbmblocks, NULL);
+ if (error)
+ goto out_free;
+ error = xfs_rtfile_initialize_blocks(rtg, XFS_RTGI_SUMMARY, orsumblocks,
+ nmp->m_rsumblocks, NULL);
+out_free:
+ kfree(nmp);
+ return error;
+}
+
+static int
+xfs_growfs_rtg(
+ struct xfs_mount *mp,
+ xfs_rgnumber_t rgno,
+ xfs_rfsblock_t nrblocks,
+ xfs_agblock_t rextsize)
+{
+ uint8_t *old_rsum_cache = NULL;
+ xfs_extlen_t bmblocks;
+ xfs_fileoff_t bmbno;
+ struct xfs_rtgroup *rtg;
+ unsigned int i;
+ int error;
+
+ rtg = xfs_rtgroup_grab(mp, rgno);
+ if (!rtg)
+ return -EINVAL;
+
+ for (i = 0; i < XFS_RTGI_MAX; i++) {
+ error = xfs_rtginode_ensure(rtg, i);
+ if (error)
+ goto out_rele;
+ }
+
+ error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
+ if (error)
+ goto out_rele;
+
+ if (bmblocks != rtg_mount(rtg)->m_sb.sb_rbmblocks) {
+ old_rsum_cache = rtg->rtg_rsum_cache;
+ error = xfs_alloc_rsum_cache(rtg, bmblocks);
+ if (error)
+ goto out_rele;
+ }
+
+ for (bmbno = xfs_last_rt_bmblock(rtg); bmbno < bmblocks; bmbno++) {
+ error = xfs_growfs_rt_bmblock(rtg, nrblocks, rextsize, bmbno);
+ if (error)
+ goto out_error;
+ }
+
+ if (old_rsum_cache)
+ kvfree(old_rsum_cache);
+ xfs_rtgroup_rele(rtg);
+ return 0;
+
+out_error:
+ /*
+ * Reset rtg_extents to the old value if adding more blocks failed.
+ */
+ xfs_rtgroup_calc_geometry(mp, rtg, rtg_rgno(rtg), mp->m_sb.sb_rgcount,
+ mp->m_sb.sb_rextents);
+ if (old_rsum_cache) {
+ kvfree(rtg->rtg_rsum_cache);
+ rtg->rtg_rsum_cache = old_rsum_cache;
+ }
+out_rele:
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
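
xfs_growfs_rtg() keeps the old summary cache around so it can restore it if
growing fails partway through. A generic, self-contained sketch of that
save/replace/restore-on-error pattern (illustrative only; all names here are
made up, not kernel API):

	#include <stdlib.h>

	struct cache { void *buf; };

	static int resize_cache(struct cache *c, size_t new_size,
				int (*fill)(void *buf, size_t size))
	{
		void *old = c->buf;
		void *newbuf = malloc(new_size);
		int error;

		if (!newbuf)
			return -1;
		c->buf = newbuf;

		error = fill(c->buf, new_size);
		if (error) {
			free(c->buf);	/* drop the new copy ... */
			c->buf = old;	/* ... and restore the old one */
			return error;
		}

		free(old);		/* success: the old copy is stale */
		return 0;
	}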
+
+static int
+xfs_growfs_check_rtgeom(
+ const struct xfs_mount *mp,
+ xfs_rfsblock_t rblocks,
+ xfs_extlen_t rextsize)
+{
+ struct xfs_mount *nmp;
+ int error = 0;
+
+ nmp = xfs_growfs_rt_alloc_fake_mount(mp, rblocks, rextsize);
+ if (!nmp)
+ return -ENOMEM;
+
+ /*
+ * New summary size can't be more than half the size of the log. This
+ * prevents us from getting a log overflow, since we'll log basically
+ * the whole summary file at once.
+ */
+ if (nmp->m_rsumblocks > (mp->m_sb.sb_logblocks >> 1))
+ error = -EINVAL;
+
+ kfree(nmp);
+ return error;
+}
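
The check above caps the summary file at half the log because a growfs
transaction can end up logging essentially the whole summary file at once. A
one-line userspace restatement with assumed 64-bit block counts:

	#include <stdbool.h>
	#include <stdint.h>

	/* refuse a geometry whose summary could overflow half the log */
	static bool rt_summary_fits_log(uint64_t rsumblocks, uint64_t logblocks)
	{
		return rsumblocks <= logblocks >> 1;
	}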
+
+/*
+ * Compute the new number of rt groups and ensure that /rtgroups exists.
+ *
+ * Changing the rtgroup size is not allowed (even if the rt volume hasn't yet
+ * been initialized) because the userspace ABI doesn't support it.
+ */
+static int
+xfs_growfs_rt_prep_groups(
+ struct xfs_mount *mp,
+ xfs_rfsblock_t rblocks,
+ xfs_extlen_t rextsize,
+ xfs_rgnumber_t *new_rgcount)
+{
+ int error;
+
+ *new_rgcount = howmany_64(rblocks, mp->m_sb.sb_rgextents * rextsize);
+ if (*new_rgcount > XFS_MAX_RGNUMBER)
+ return -EINVAL;
+
+ /* Make sure the /rtgroups dir has been created */
+ if (!mp->m_rtdirip) {
+ struct xfs_trans *tp;
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ return error;
+ error = xfs_rtginode_load_parent(tp);
+ xfs_trans_cancel(tp);
+
+ if (error == -ENOENT)
+ error = xfs_rtginode_mkdir_parent(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
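
xfs_growfs_rt_prep_groups() derives the new group count by rounding the block
count up to whole groups. A sketch of the howmany_64() arithmetic with assumed
inputs:

	#include <stdint.h>

	/* groups needed to cover rblocks, rounding the tail up */
	static uint64_t new_rgcount(uint64_t rblocks, uint64_t rgextents,
				    uint64_t rextsize)
	{
		uint64_t rgblocks = rgextents * rextsize;

		return (rblocks + rgblocks - 1) / rgblocks;
	}
	/* e.g. 10500 blocks at 1000 extents/group * 4 blocks/extent:
	 * (10500 + 3999) / 4000 = 3 groups */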
+
+static bool
+xfs_grow_last_rtg(
+ struct xfs_mount *mp)
+{
+ if (!xfs_has_rtgroups(mp))
+ return true;
+ if (mp->m_sb.sb_rgcount == 0)
+ return false;
+ return xfs_rtgroup_extents(mp, mp->m_sb.sb_rgcount - 1) <=
+ mp->m_sb.sb_rgextents;
+}
+
+/*
* Grow the realtime area of the filesystem.
*/
int
xfs_growfs_rt(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_rt_t *in) /* growfs rt input struct */
+ struct xfs_mount *mp,
+ struct xfs_growfs_rt *in)
{
- xfs_fileoff_t bmbno; /* bitmap block number */
- struct xfs_buf *bp; /* temporary buffer */
- int error; /* error return value */
- xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
- xfs_rtxnum_t nrextents; /* new number of realtime extents */
- xfs_extlen_t nrsumblocks; /* new number of summary blocks */
- xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
- xfs_extlen_t rsumblocks; /* current number of rt summary blks */
- uint8_t *rsum_cache; /* old summary cache */
- xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
+ xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
+ xfs_rgnumber_t new_rgcount = 1;
+ xfs_rgnumber_t rgno;
+ struct xfs_buf *bp;
+ xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -889,15 +1249,9 @@ xfs_growfs_rt(
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
- /*
- * Mount should fail if the rt bitmap/summary files don't load, but
- * we'll check anyway.
- */
- error = -EINVAL;
- if (!mp->m_rbmip || !mp->m_rsumip)
- goto out_unlock;
/* Shrink not supported. */
+ error = -EINVAL;
if (in->newblocks <= mp->m_sb.sb_rblocks)
goto out_unlock;
/* Can only change rt extent size when adding rt volume. */
@@ -911,7 +1265,9 @@ xfs_growfs_rt(
/* Unsupported realtime features. */
error = -EOPNOTSUPP;
- if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp))
+ if (xfs_has_quota(mp) && !xfs_has_rtgroups(mp))
+ goto out_unlock;
+ if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp))
goto out_unlock;
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
@@ -930,80 +1286,64 @@ xfs_growfs_rt(
/*
* Calculate new parameters. These are the final values to be reached.
*/
- nrextents = div_u64(in->newblocks, in->extsize);
- if (nrextents == 0) {
- error = -EINVAL;
- goto out_unlock;
- }
- nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
- nrsumblocks = xfs_rtsummary_blockcount(mp,
- xfs_compute_rextslog(nrextents) + 1, nrbmblocks);
-
- /*
- * New summary size can't be more than half the size of
- * the log. This prevents us from getting a log overflow,
- * since we'll log basically the whole summary file at once.
- */
- if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) {
- error = -EINVAL;
+ error = -EINVAL;
+ if (in->newblocks < in->extsize)
goto out_unlock;
- }
- /*
- * Get the old block counts for bitmap and summary inodes.
- * These can't change since other growfs callers are locked out.
- */
- rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_disk_size);
- rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_disk_size);
- /*
- * Allocate space to the bitmap and summary files, as necessary.
- */
- error = xfs_rtfile_initialize_blocks(mp->m_rbmip, rbmblocks,
- nrbmblocks, NULL);
- if (error)
- goto out_unlock;
- error = xfs_rtfile_initialize_blocks(mp->m_rsumip, rsumblocks,
- nrsumblocks, NULL);
+ /* Make sure the new fs size won't cause problems with the log. */
+ error = xfs_growfs_check_rtgeom(mp, in->newblocks, in->extsize);
if (error)
goto out_unlock;
- rsum_cache = mp->m_rsum_cache;
- if (nrbmblocks != mp->m_sb.sb_rbmblocks) {
- error = xfs_alloc_rsum_cache(mp, nrbmblocks);
+ if (xfs_has_rtgroups(mp)) {
+ error = xfs_growfs_rt_prep_groups(mp, in->newblocks,
+ in->extsize, &new_rgcount);
if (error)
goto out_unlock;
}
- /* Initialize the free space bitmap one bitmap block at a time. */
- for (bmbno = xfs_last_rt_bmblock(mp); bmbno < nrbmblocks; bmbno++) {
- error = xfs_growfs_rt_bmblock(mp, in->newblocks, in->extsize,
- bmbno);
+ if (xfs_grow_last_rtg(mp)) {
+ error = xfs_growfs_rtg(mp, old_rgcount - 1, in->newblocks,
+ in->extsize);
if (error)
- goto out_free;
+ goto out_unlock;
}
- if (old_rextsize != in->extsize) {
- error = xfs_growfs_rt_fixup_extsize(mp);
+ for (rgno = old_rgcount; rgno < new_rgcount; rgno++) {
+ xfs_rtbxlen_t rextents = div_u64(in->newblocks, in->extsize);
+
+ error = xfs_rtgroup_alloc(mp, rgno, new_rgcount, rextents);
if (error)
- goto out_free;
+ goto out_unlock;
+
+ error = xfs_growfs_rtg(mp, rgno, in->newblocks, in->extsize);
+ if (error) {
+ struct xfs_rtgroup *rtg;
+
+ rtg = xfs_rtgroup_grab(mp, rgno);
+ if (!WARN_ON_ONCE(!rtg)) {
+ xfs_rtunmount_rtg(rtg);
+ xfs_rtgroup_rele(rtg);
+ xfs_rtgroup_free(mp, rgno);
+ }
+ break;
+ }
}
- /* Update secondary superblocks now the physical grow has completed */
- error = xfs_update_secondary_sbs(mp);
+ if (!error && old_rextsize != in->extsize)
+ error = xfs_growfs_rt_fixup_extsize(mp);
-out_free:
/*
- * If we had to allocate a new rsum_cache, we either need to free the
- * old one (if we succeeded) or free the new one and restore the old one
- * (if there was an error).
+ * Update secondary superblocks now the physical grow has completed.
+ *
+ * Also do this in case of an error as we might have already
+ * successfully updated one or more RTGs and incremented sb_rgcount.
*/
- if (rsum_cache != mp->m_rsum_cache) {
- if (error) {
- kvfree(mp->m_rsum_cache);
- mp->m_rsum_cache = rsum_cache;
- } else {
- kvfree(rsum_cache);
- }
+ if (!xfs_is_shutdown(mp)) {
+ int error2 = xfs_update_secondary_sbs(mp);
+
+ if (!error)
+ error = error2;
}
out_unlock:
@@ -1011,6 +1351,56 @@ out_unlock:
return error;
}
+/* Read the realtime superblock and attach it to the mount. */
+int
+xfs_rtmount_readsb(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ if (!xfs_has_rtsb(mp))
+ return 0;
+ if (mp->m_sb.sb_rblocks == 0)
+ return 0;
+ if (mp->m_rtdev_targp == NULL) {
+ xfs_warn(mp,
+ "Filesystem has a realtime volume, use rtdev=device option");
+ return -ENODEV;
+ }
+
+ /* m_blkbb_log is not set up yet */
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
+ mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
+ &xfs_rtsb_buf_ops);
+ if (error) {
+ xfs_warn(mp, "rt sb validate failed with error %d.", error);
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
+ mp->m_rtsb_bp = bp;
+ xfs_buf_unlock(bp);
+ return 0;
+}
+
+/* Detach the realtime superblock from the mount and free it. */
+void
+xfs_rtmount_freesb(
+ struct xfs_mount *mp)
+{
+ struct xfs_buf *bp = mp->m_rtsb_bp;
+
+ if (!bp)
+ return;
+
+ xfs_buf_lock(bp);
+ mp->m_rtsb_bp = NULL;
+ xfs_buf_relse(bp);
+}
+
/*
* Initialize realtime fields in the mount structure.
*/
@@ -1019,22 +1409,19 @@ xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
struct xfs_buf *bp; /* buffer for last block of subvolume */
- struct xfs_sb *sbp; /* filesystem superblock copy in mount */
xfs_daddr_t d; /* address of last block of subvolume */
int error;
- sbp = &mp->m_sb;
- if (sbp->sb_rblocks == 0)
+ if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
xfs_warn(mp,
"Filesystem has a realtime volume, use rtdev=device option");
return -ENODEV;
}
- mp->m_rsumlevels = sbp->sb_rextslog + 1;
- mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels,
- mp->m_sb.sb_rbmblocks);
- mp->m_rbmip = mp->m_rsumip = NULL;
+
+ mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
+
/*
* Check that the realtime section is an ok size.
*/
@@ -1058,7 +1445,7 @@ xfs_rtmount_init(
static int
xfs_rtalloc_count_frextent(
- struct xfs_mount *mp,
+ struct xfs_rtgroup *rtg,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -1080,12 +1467,18 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
- error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
- &val);
- xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
- if (error)
- return error;
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ error = xfs_rtalloc_query_all(rtg, NULL,
+ xfs_rtalloc_count_frextent, &val);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ return error;
+ }
+ }
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
@@ -1101,17 +1494,12 @@ xfs_rtalloc_reinit_frextents(
*/
static inline int
xfs_rtmount_iread_extents(
- struct xfs_inode *ip,
- unsigned int lock_class)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
@@ -1124,54 +1512,67 @@ xfs_rtmount_iread_extents(
}
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
- xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
+static int
+xfs_rtmount_rtg(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg)
+{
+ int error, i;
+
+ for (i = 0; i < XFS_RTGI_MAX; i++) {
+ error = xfs_rtginode_load(rtg, i, tp);
+ if (error)
+ return error;
+
+ if (rtg->rtg_inodes[i]) {
+ error = xfs_rtmount_iread_extents(tp,
+ rtg->rtg_inodes[i]);
+ if (error)
+ return error;
+ }
+ }
+
+ return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
*/
-int /* error */
+int
xfs_rtmount_inodes(
- xfs_mount_t *mp) /* file system mount structure */
+ struct xfs_mount *mp)
{
- int error; /* error return value */
- xfs_sb_t *sbp;
+ struct xfs_trans *tp;
+ struct xfs_rtgroup *rtg = NULL;
+ int error;
- sbp = &mp->m_sb;
- error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
- if (xfs_metadata_is_sick(error))
- xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
+ error = xfs_trans_alloc_empty(mp, &tp);
if (error)
return error;
- ASSERT(mp->m_rbmip != NULL);
- error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP);
- if (error)
- goto out_rele_bitmap;
-
- error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
- if (xfs_metadata_is_sick(error))
- xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
- if (error)
- goto out_rele_bitmap;
- ASSERT(mp->m_rsumip != NULL);
-
- error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM);
- if (error)
- goto out_rele_summary;
+ if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) {
+ error = xfs_rtginode_load_parent(tp);
+ if (error)
+ goto out_cancel;
+ }
- error = xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
- if (error)
- goto out_rele_summary;
- return 0;
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_rtmount_rtg(mp, tp, rtg);
+ if (error) {
+ xfs_rtgroup_rele(rtg);
+ xfs_rtunmount_inodes(mp);
+ break;
+ }
+ }
-out_rele_summary:
- xfs_irele(mp->m_rsumip);
-out_rele_bitmap:
- xfs_irele(mp->m_rbmip);
+out_cancel:
+ xfs_trans_cancel(tp);
return error;
}
@@ -1179,11 +1580,11 @@ void
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
- kvfree(mp->m_rsum_cache);
- if (mp->m_rbmip)
- xfs_irele(mp->m_rbmip);
- if (mp->m_rsumip)
- xfs_irele(mp->m_rsumip);
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg)))
+ xfs_rtunmount_rtg(rtg);
+ xfs_rtginode_irele(&mp->m_rtdirip);
}
/*
@@ -1195,28 +1596,29 @@ xfs_rtunmount_inodes(
*/
static xfs_rtxnum_t
xfs_rtpick_extent(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
+ struct xfs_rtgroup *rtg,
+ struct xfs_trans *tp,
xfs_rtxlen_t len) /* allocation length (rtextents) */
{
- xfs_rtxnum_t b; /* result rtext */
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_inode *rbmip = rtg->rtg_inodes[XFS_RTGI_BITMAP];
+ xfs_rtxnum_t b = 0; /* result rtext */
int log2; /* log of sequence number */
uint64_t resid; /* residual after log removed */
uint64_t seq; /* sequence number of file creation */
struct timespec64 ts; /* timespec in inode */
- xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
- ts = inode_get_atime(VFS_I(mp->m_rbmip));
- if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
- mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ ts = inode_get_atime(VFS_I(rbmip));
+ if (!(rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
+ rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM;
seq = 0;
} else {
seq = ts.tv_sec;
}
- if ((log2 = xfs_highbit64(seq)) == -1)
- b = 0;
- else {
+ log2 = xfs_highbit64(seq);
+ if (log2 != -1) {
resid = seq - (1ULL << log2);
b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
(log2 + 1);
@@ -1226,8 +1628,8 @@ xfs_rtpick_extent(
b = mp->m_sb.sb_rextents - len;
}
ts.tv_sec = seq + 1;
- inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts);
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ inode_set_atime_to_ts(VFS_I(rbmip), ts);
+ xfs_trans_log_inode(tp, rbmip, XFS_ILOG_CORE);
return b;
}
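
xfs_rtpick_extent() spreads initial data allocations across the rt volume by
deriving a start extent from an allocation sequence number stashed in the
bitmap inode's atime. A userspace sketch of the spreading function
(illustrative; __builtin_clzll stands in for xfs_highbit64):

	#include <stdint.h>

	static uint64_t rtpick_start(uint64_t seq, uint64_t rextents,
				     uint64_t len)
	{
		uint64_t b = 0;

		if (seq) {
			int log2 = 63 - __builtin_clzll(seq); /* highest bit */
			uint64_t resid = seq - (1ULL << log2);

			/* walk the volume in progressively finer steps */
			b = (rextents * ((resid << 1) + 1ULL)) >> (log2 + 1);
			if (b + len > rextents)
				b = rextents - len;
		}
		return b;
	}
	/* seq = 1 picks the midpoint, seq = 2 the first quarter,
	 * seq = 3 the third quarter, and so on. */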
@@ -1260,9 +1662,118 @@ xfs_rtalloc_align_minmax(
*raminlen = newminlen;
}
+/* Given a free extent, find any part of it that isn't busy, if possible. */
+STATIC bool
+xfs_rtalloc_check_busy(
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t minlen_rtx,
+ xfs_rtxlen_t maxlen_rtx,
+ xfs_rtxlen_t len_rtx,
+ xfs_rtxlen_t prod,
+ xfs_rtxnum_t rtx,
+ xfs_rtxlen_t *reslen,
+ xfs_rtxnum_t *resrtx,
+ unsigned *busy_gen)
+{
+ struct xfs_rtgroup *rtg = args->rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rgblock_t rgbno = xfs_rtx_to_rgbno(rtg, rtx);

+ xfs_rgblock_t min_rgbno = xfs_rtx_to_rgbno(rtg, start);
+ xfs_extlen_t minlen = xfs_rtxlen_to_extlen(mp, minlen_rtx);
+ xfs_extlen_t len = xfs_rtxlen_to_extlen(mp, len_rtx);
+ xfs_extlen_t diff;
+ bool busy;
+
+ busy = xfs_extent_busy_trim(rtg_group(rtg), minlen,
+ xfs_rtxlen_to_extlen(mp, maxlen_rtx), &rgbno, &len,
+ busy_gen);
+
+ /*
+ * If we have a largish extent that happens to start before min_rgbno,
+ * see if we can shift it into range...
+ */
+ if (rgbno < min_rgbno && rgbno + len > min_rgbno) {
+ diff = min_rgbno - rgbno;
+ if (len > diff) {
+ rgbno += diff;
+ len -= diff;
+ }
+ }
+
+ if (prod > 1 && len >= minlen) {
+ xfs_rgblock_t aligned_rgbno = roundup(rgbno, prod);
+
+ diff = aligned_rgbno - rgbno;
+
+ *resrtx = xfs_rgbno_to_rtx(mp, aligned_rgbno);
+ *reslen = xfs_extlen_to_rtxlen(mp,
+ diff >= len ? 0 : len - diff);
+ } else {
+ *resrtx = xfs_rgbno_to_rtx(mp, rgbno);
+ *reslen = xfs_extlen_to_rtxlen(mp, len);
+ }
+
+ return busy;
+}
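
One subtlety in xfs_rtalloc_check_busy() above: the trimmed extent may begin
before the caller's minimum block, in which case only the part at or past the
minimum is usable. A minimal sketch of just that trim step (assumed 32-bit
group block numbers):

	#include <stdint.h>

	static void shift_into_range(uint32_t *bno, uint32_t *len,
				     uint32_t min_bno)
	{
		if (*bno < min_bno && *bno + *len > min_bno) {
			uint32_t diff = min_bno - *bno;

			if (*len > diff) {
				*bno += diff;	/* drop the part below min_bno */
				*len -= diff;
			}
		}
	}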
+
+/*
+ * Adjust the given free extent so that it isn't busy, or flush the log and
+ * wait for the space to become unbusy. Only needed for rtgroups.
+ */
+STATIC int
+xfs_rtallocate_adjust_for_busy(
+ struct xfs_rtalloc_args *args,
+ xfs_rtxnum_t start,
+ xfs_rtxlen_t minlen,
+ xfs_rtxlen_t maxlen,
+ xfs_rtxlen_t *len,
+ xfs_rtxlen_t prod,
+ xfs_rtxnum_t *rtx)
+{
+ xfs_rtxnum_t resrtx;
+ xfs_rtxlen_t reslen;
+ unsigned busy_gen;
+ bool busy;
+ int error;
+
+again:
+ busy = xfs_rtalloc_check_busy(args, start, minlen, maxlen, *len, prod,
+ *rtx, &reslen, &resrtx, &busy_gen);
+ if (!busy)
+ return 0;
+
+ if (reslen < minlen || (start != 0 && resrtx != *rtx)) {
+ /*
+ * Enough of the extent was busy that we cannot satisfy the
+ * allocation, or this is a near allocation and the start of
+ * the extent is busy. Flush the log and wait for the busy
+ * situation to resolve.
+ */
+ trace_xfs_rtalloc_extent_busy(args->rtg, start, minlen, maxlen,
+ *len, prod, *rtx, busy_gen);
+
+ error = xfs_extent_busy_flush(args->tp, rtg_group(args->rtg),
+ busy_gen, 0);
+ if (error)
+ return error;
+
+ goto again;
+ }
+
+ /* Some of the free space wasn't busy, hand that back to the caller. */
+ trace_xfs_rtalloc_extent_busy_trim(args->rtg, *rtx, *len, resrtx,
+ reslen);
+ *len = reslen;
+ *rtx = resrtx;
+
+ return 0;
+}
+
static int
-xfs_rtallocate(
+xfs_rtallocate_rtg(
struct xfs_trans *tp,
+ xfs_rgnumber_t rgno,
xfs_rtblock_t bno_hint,
xfs_rtxlen_t minlen,
xfs_rtxlen_t maxlen,
@@ -1282,12 +1793,33 @@ xfs_rtallocate(
xfs_rtxlen_t len = 0;
int error = 0;
+ args.rtg = xfs_rtgroup_grab(args.mp, rgno);
+ if (!args.rtg)
+ return -ENOSPC;
+
/*
- * Lock out modifications to both the RT bitmap and summary inodes.
+ * We need to lock out modifications to both the RT bitmap and summary
+ * inodes for finding free space in xfs_rtallocate_extent_{near,size}
+ * and join the bitmap and summary inodes for the actual allocation
+ * down in xfs_rtallocate_range.
+ *
+ * For RTG-enabled file systems we don't want to join the inodes to the
+ * transaction until we are committed to allocating from this RTG, so
+ * that only one inode of each type is locked at a time.
+ *
+ * But for pre-RTG file systems we already need to join the bitmap
+ * inode to the transaction for xfs_rtpick_extent, which bumps the
+ * sequence number in it, so we'll have to join the inode to the
+ * transaction early here.
+ *
+ * This is all a bit messy, but at least the mess is contained in
+ * this function.
*/
if (!*rtlocked) {
- xfs_rtbitmap_lock(args.mp);
- xfs_rtbitmap_trans_join(tp);
+ xfs_rtgroup_lock(args.rtg, XFS_RTGLOCK_BITMAP);
+ if (!xfs_has_rtgroups(args.mp))
+ xfs_rtgroup_trans_join(tp, args.rtg,
+ XFS_RTGLOCK_BITMAP);
*rtlocked = true;
}
@@ -1297,8 +1829,8 @@ xfs_rtallocate(
*/
if (bno_hint)
start = xfs_rtb_to_rtx(args.mp, bno_hint);
- else if (initial_user_data)
- start = xfs_rtpick_extent(args.mp, tp, maxlen);
+ else if (!xfs_has_rtgroups(args.mp) && initial_user_data)
+ start = xfs_rtpick_extent(args.rtg, tp, maxlen);
if (start) {
error = xfs_rtallocate_extent_near(&args, start, minlen, maxlen,
@@ -1318,8 +1850,20 @@ xfs_rtallocate(
prod, &rtx);
}
- if (error)
+ if (error) {
+ if (xfs_has_rtgroups(args.mp))
+ goto out_unlock;
goto out_release;
+ }
+
+ if (xfs_has_rtgroups(args.mp)) {
+ error = xfs_rtallocate_adjust_for_busy(&args, start, minlen,
+ maxlen, &len, prod, &rtx);
+ if (error)
+ goto out_unlock;
+
+ xfs_rtgroup_trans_join(tp, args.rtg, XFS_RTGLOCK_BITMAP);
+ }
error = xfs_rtallocate_range(&args, rtx, len);
if (error)
@@ -1328,12 +1872,64 @@ xfs_rtallocate(
xfs_trans_mod_sb(tp, wasdel ?
XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
-(long)len);
- *bno = xfs_rtx_to_rtb(args.mp, rtx);
+ *bno = xfs_rtx_to_rtb(args.rtg, rtx);
*blen = xfs_rtxlen_to_extlen(args.mp, len);
out_release:
+ xfs_rtgroup_rele(args.rtg);
xfs_rtbuf_cache_relse(&args);
return error;
+out_unlock:
+ xfs_rtgroup_unlock(args.rtg, XFS_RTGLOCK_BITMAP);
+ *rtlocked = false;
+ goto out_release;
+}
+
+static int
+xfs_rtallocate_rtgs(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno_hint,
+ xfs_rtxlen_t minlen,
+ xfs_rtxlen_t maxlen,
+ xfs_rtxlen_t prod,
+ bool wasdel,
+ bool initial_user_data,
+ xfs_rtblock_t *bno,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rgnumber_t start_rgno, rgno;
+ int error;
+
+ /*
+ * For now this just blindly iterates over the RTGs for an initial
+ * allocation. We could try to keep an in-memory rtg_longest member
+ * to avoid the locking when just looking for big enough free space,
+ * but for now this keeps things simple.
+ */
+ if (bno_hint != NULLFSBLOCK)
+ start_rgno = xfs_rtb_to_rgno(mp, bno_hint);
+ else
+ start_rgno = (atomic_inc_return(&mp->m_rtgrotor) - 1) %
+ mp->m_sb.sb_rgcount;
+
+ rgno = start_rgno;
+ do {
+ bool rtlocked = false;
+
+ error = xfs_rtallocate_rtg(tp, rgno, bno_hint, minlen, maxlen,
+ prod, wasdel, initial_user_data, &rtlocked,
+ bno, blen);
+ if (error != -ENOSPC)
+ return error;
+ ASSERT(!rtlocked);
+
+ if (++rgno == mp->m_sb.sb_rgcount)
+ rgno = 0;
+ bno_hint = NULLFSBLOCK;
+ } while (rgno != start_rgno);
+
+ return -ENOSPC;
}
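
Without a block hint, xfs_rtallocate_rtgs() starts at a rotor so that initial
allocations round-robin across the groups. A self-contained sketch of the
rotor (C11 atomics standing in for the kernel's atomic_inc_return; note that
atomic_inc_return(x) - 1 is exactly the fetch-and-add result):

	#include <stdatomic.h>
	#include <stdint.h>

	static atomic_uint rtgrotor;

	static uint32_t next_start_group(uint32_t group_count)
	{
		/* post-increment, wrapped to the group count */
		return atomic_fetch_add(&rtgrotor, 1) % group_count;
	}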
static int
@@ -1430,9 +2026,16 @@ retry:
if (xfs_bmap_adjacent(ap))
bno_hint = ap->blkno;
- error = xfs_rtallocate(ap->tp, bno_hint, raminlen, ralen, prod,
- ap->wasdel, initial_user_data, &rtlocked,
- &ap->blkno, &ap->length);
+ if (xfs_has_rtgroups(ap->ip->i_mount)) {
+ error = xfs_rtallocate_rtgs(ap->tp, bno_hint, raminlen, ralen,
+ prod, ap->wasdel, initial_user_data,
+ &ap->blkno, &ap->length);
+ } else {
+ error = xfs_rtallocate_rtg(ap->tp, 0, bno_hint, raminlen, ralen,
+ prod, ap->wasdel, initial_user_data,
+ &rtlocked, &ap->blkno, &ap->length);
+ }
+
if (error == -ENOSPC) {
if (!noalign) {
/*
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index a6836da9bebe..8e2a07b8174b 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -12,6 +12,10 @@ struct xfs_mount;
struct xfs_trans;
#ifdef CONFIG_XFS_RT
+/* rtgroup superblock initialization */
+int xfs_rtmount_readsb(struct xfs_mount *mp);
+void xfs_rtmount_freesb(struct xfs_mount *mp);
+
/*
* Initialize realtime fields in the mount structure.
*/
@@ -42,6 +46,8 @@ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_growfs_rt(mp,in) (-ENOSYS)
# define xfs_rtalloc_reinit_frextents(m) (0)
+# define xfs_rtmount_readsb(mp) (0)
+# define xfs_rtmount_freesb(mp) ((void)0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index ed97d72caa66..ffb52725c2a8 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -115,10 +115,11 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
static int xqm_proc_show(struct seq_file *m, void *v)
{
- /* maximum; incore; ratio free to inuse; freelist */
- seq_printf(m, "%d\t%d\t%d\t%u\n",
+ /* maximum; incore; ratio free to inuse; freelist; rtquota */
+ seq_printf(m, "%d\t%d\t%d\t%u\t%s\n",
0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
- 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1),
+ IS_ENABLED(CONFIG_XFS_RT) ? "rtquota" : "quota");
return 0;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fbb3a1594c0d..394fdf3bb535 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
+#include "xfs_rtalloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -66,6 +67,9 @@ enum xfs_dax_mode {
XFS_DAX_NEVER = 2,
};
+/* Were quota mount options provided? Must use the upper 16 bits of qflags. */
+#define XFS_QFLAGS_MNTOPTS (1U << 31)
+
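
The sentinel must not collide with any real quota flag, which the patch later
enforces with BUILD_BUG_ON in xfs_fs_parse_param(). A standalone restatement
using C11 _Static_assert; the 0xffff mask for the real flags is an assumption
for illustration, not the actual XFS_MOUNT_QUOTA_ALL value:

	#define QFLAGS_MNTOPTS	(1U << 31)
	#define QUOTA_ALL_MASK	0x0000ffffU	/* assumed: low 16 bits */

	_Static_assert((QFLAGS_MNTOPTS & QUOTA_ALL_MASK) == 0,
		       "mount-option sentinel overlaps real quota flags");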
static void
xfs_mount_set_dax_mode(
struct xfs_mount *mp,
@@ -238,7 +242,7 @@ xfs_set_inode_alloc_perag(
xfs_ino_t ino,
xfs_agnumber_t max_metadata)
{
- if (!xfs_is_inode32(pag->pag_mount)) {
+ if (!xfs_is_inode32(pag_mount(pag))) {
set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate);
clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
return false;
@@ -251,7 +255,7 @@ xfs_set_inode_alloc_perag(
}
set_bit(XFS_AGSTATE_ALLOWS_INODES, &pag->pag_opstate);
- if (pag->pag_agno < max_metadata)
+ if (pag_agno(pag) < max_metadata)
set_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
else
clear_bit(XFS_AGSTATE_PREFERS_METADATA, &pag->pag_opstate);
@@ -873,21 +877,21 @@ xfs_fs_statfs(
ffree = statp->f_files - (icount - ifree);
statp->f_ffree = max_t(int64_t, ffree, 0);
-
- if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
- (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
- xfs_qm_statvfs(ip, statp);
-
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
s64 freertx;
statp->f_blocks = sbp->sb_rblocks;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
- statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx);
+ statp->f_bavail = statp->f_bfree =
+ xfs_rtbxlen_to_blen(mp, freertx);
}
+ if ((ip->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+ (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
+ xfs_qm_statvfs(ip, statp);
+
return 0;
}
@@ -1144,6 +1148,7 @@ xfs_fs_put_super(
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
+ xfs_rtmount_freesb(mp);
xfs_freesb(mp);
xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
@@ -1261,6 +1266,8 @@ xfs_fs_parse_param(
int size = 0;
int opt;
+ BUILD_BUG_ON(XFS_QFLAGS_MNTOPTS & XFS_MOUNT_QUOTA_ALL);
+
opt = fs_parse(fc, xfs_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -1338,32 +1345,39 @@ xfs_fs_parse_param(
case Opt_noquota:
parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_quota:
case Opt_uquota:
case Opt_usrquota:
parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD);
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_qnoenforce:
case Opt_uqnoenforce:
parsing_mp->m_qflags |= XFS_UQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_pquota:
case Opt_prjquota:
parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD);
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_pqnoenforce:
parsing_mp->m_qflags |= XFS_PQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_gquota:
case Opt_grpquota:
parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD);
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_gqnoenforce:
parsing_mp->m_qflags |= XFS_GQUOTA_ACCT;
parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ parsing_mp->m_qflags |= XFS_QFLAGS_MNTOPTS;
return 0;
case Opt_discard:
parsing_mp->m_features |= XFS_FEAT_DISCARD;
@@ -1430,7 +1444,8 @@ xfs_fs_validate_params(
return -EINVAL;
}
- if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) {
+ if (!IS_ENABLED(CONFIG_XFS_QUOTA) &&
+ (mp->m_qflags & ~XFS_QFLAGS_MNTOPTS)) {
xfs_warn(mp, "quota support not available in this kernel.");
return -EINVAL;
}
@@ -1657,9 +1672,7 @@ xfs_fs_fill_super(
goto out_free_sb;
}
- xfs_warn(mp,
-"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.",
- mp->m_sb.sb_blocksize);
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LBS);
}
/* Ensure this filesystem fits in the page cache limits */
@@ -1691,10 +1704,14 @@ xfs_fs_fill_super(
goto out_free_sb;
}
- error = xfs_filestream_mount(mp);
+ error = xfs_rtmount_readsb(mp);
if (error)
goto out_free_sb;
+ error = xfs_filestream_mount(mp);
+ if (error)
+ goto out_free_rtsb;
+
/*
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
@@ -1733,6 +1750,9 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
+ if (xfs_has_metadir(mp))
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+
if (xfs_has_reflink(mp)) {
if (mp->m_sb.sb_rblocks) {
xfs_alert(mp,
@@ -1755,12 +1775,18 @@ xfs_fs_fill_super(
}
if (xfs_has_exchange_range(mp))
- xfs_warn(mp,
- "EXPERIMENTAL exchange-range feature enabled. Use at your own risk!");
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);
if (xfs_has_parent(mp))
- xfs_warn(mp,
- "EXPERIMENTAL parent pointer feature enabled. Use at your own risk!");
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR);
+
+ /*
+ * If no quota mount options were provided, maybe we'll try to pick
+ * up the quota accounting and enforcement flags from the ondisk sb.
+ */
+ if (!(mp->m_qflags & XFS_QFLAGS_MNTOPTS))
+ xfs_set_resuming_quotaon(mp);
+ mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS;
error = xfs_mountfs(mp);
if (error)
@@ -1781,6 +1807,8 @@ xfs_fs_fill_super(
out_filestream_unmount:
xfs_filestream_unmount(mp);
+ out_free_rtsb:
+ xfs_rtmount_freesb(mp);
out_free_sb:
xfs_freesb(mp);
out_free_scrub_stats:
@@ -1800,7 +1828,7 @@ xfs_fs_fill_super(
out_unmount:
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- goto out_free_sb;
+ goto out_free_rtsb;
}
static int
@@ -1946,6 +1974,8 @@ xfs_fs_reconfigure(
int flags = fc->sb_flags;
int error;
+ new_mp->m_qflags &= ~XFS_QFLAGS_MNTOPTS;
+
/* version 5 superblocks always support version counters. */
if (xfs_has_crc(mp))
fc->sb_flags |= SB_I_VERSION;
@@ -2011,17 +2041,20 @@ static const struct fs_context_operations xfs_context_ops = {
* mount option parsing having already been performed as this can be called from
* fsopen() before any parameters have been set.
*/
-static int xfs_init_fs_context(
+static int
+xfs_init_fs_context(
struct fs_context *fc)
{
struct xfs_mount *mp;
+ int i;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
if (!mp)
return -ENOMEM;
spin_lock_init(&mp->m_sb_lock);
- xa_init(&mp->m_perags);
+ for (i = 0; i < XG_TYPE_MAX; i++)
+ xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
@@ -2063,7 +2096,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("xfs");
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 2af9f274e872..8f530e69c18a 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -11,6 +11,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_group.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
@@ -32,6 +33,7 @@
#include "xfs_fsmap.h"
#include "xfs_btree_staging.h"
#include "xfs_icache.h"
+#include "xfs_iunlink_item.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_error.h"
@@ -44,6 +46,9 @@
#include "xfs_parent.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
+#include "xfs_metafile.h"
+#include "xfs_metadir.h"
+#include "xfs_rtgroup.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index fcb2bad4f76e..7b16cdd72e9d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -72,8 +72,11 @@ struct xfs_btree_cur;
struct xfs_defer_op_type;
struct xfs_refcount_irec;
struct xfs_fsmap;
+struct xfs_fsmap_irec;
+struct xfs_group;
struct xfs_rmap_irec;
struct xfs_icreate_log;
+struct xfs_iunlink_item;
struct xfs_owner_info;
struct xfs_trans_res;
struct xfs_inobt_rec_incore;
@@ -93,6 +96,8 @@ struct xfs_attrlist_cursor_kern;
struct xfs_extent_free_item;
struct xfs_rmap_intent;
struct xfs_refcount_intent;
+struct xfs_metadir_update;
+struct xfs_rtgroup;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -181,7 +186,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
);
DECLARE_EVENT_CLASS(xfs_perag_class,
- TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip),
+ TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip),
TP_ARGS(pag, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -191,10 +196,11 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
__field(unsigned long, caller_ip)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
- __entry->refcount = atomic_read(&pag->pag_ref);
- __entry->active_refcount = atomic_read(&pag->pag_active_ref);
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->refcount = atomic_read(&pag->pag_group.xg_ref);
+ __entry->active_refcount =
+ atomic_read(&pag->pag_group.xg_active_ref);
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d agno 0x%x passive refs %d active refs %d caller %pS",
@@ -207,18 +213,54 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
#define DEFINE_PERAG_REF_EVENT(name) \
DEFINE_EVENT(xfs_perag_class, name, \
- TP_PROTO(struct xfs_perag *pag, unsigned long caller_ip), \
+ TP_PROTO(const struct xfs_perag *pag, unsigned long caller_ip), \
TP_ARGS(pag, caller_ip))
-DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_hold);
-DEFINE_PERAG_REF_EVENT(xfs_perag_put);
-DEFINE_PERAG_REF_EVENT(xfs_perag_grab);
-DEFINE_PERAG_REF_EVENT(xfs_perag_grab_next_tag);
-DEFINE_PERAG_REF_EVENT(xfs_perag_rele);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
DEFINE_PERAG_REF_EVENT(xfs_reclaim_inodes_count);
+TRACE_DEFINE_ENUM(XG_TYPE_AG);
+TRACE_DEFINE_ENUM(XG_TYPE_RTG);
+
+DECLARE_EVENT_CLASS(xfs_group_class,
+ TP_PROTO(struct xfs_group *xg, unsigned long caller_ip),
+ TP_ARGS(xg, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_group_type, type)
+ __field(xfs_agnumber_t, agno)
+ __field(int, refcount)
+ __field(int, active_refcount)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
+ __entry->refcount = atomic_read(&xg->xg_ref);
+ __entry->active_refcount = atomic_read(&xg->xg_active_ref);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d %sno 0x%x passive refs %d active refs %d caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->agno,
+ __entry->refcount,
+ __entry->active_refcount,
+ (char *)__entry->caller_ip)
+);
+
+#define DEFINE_GROUP_REF_EVENT(name) \
+DEFINE_EVENT(xfs_group_class, name, \
+ TP_PROTO(struct xfs_group *xg, unsigned long caller_ip), \
+ TP_ARGS(xg, caller_ip))
+DEFINE_GROUP_REF_EVENT(xfs_group_get);
+DEFINE_GROUP_REF_EVENT(xfs_group_hold);
+DEFINE_GROUP_REF_EVENT(xfs_group_put);
+DEFINE_GROUP_REF_EVENT(xfs_group_grab);
+DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
+DEFINE_GROUP_REF_EVENT(xfs_group_rele);
+
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@@ -299,15 +341,15 @@ TRACE_EVENT(xfs_inodegc_shrinker_scan,
);
DECLARE_EVENT_CLASS(xfs_ag_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
- TP_ARGS(mp, agno),
+ TP_PROTO(const struct xfs_perag *pag),
+ TP_ARGS(pag),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
),
TP_printk("dev %d:%d agno 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -315,8 +357,8 @@ DECLARE_EVENT_CLASS(xfs_ag_class,
);
#define DEFINE_AG_EVENT(name) \
DEFINE_EVENT(xfs_ag_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \
- TP_ARGS(mp, agno))
+ TP_PROTO(const struct xfs_perag *pag), \
+ TP_ARGS(pag))
DEFINE_AG_EVENT(xfs_read_agf);
DEFINE_AG_EVENT(xfs_alloc_read_agf);
@@ -662,7 +704,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
DECLARE_EVENT_CLASS(xfs_filestream_class,
- TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino),
+ TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino),
TP_ARGS(pag, ino),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -671,9 +713,9 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
__field(int, streams)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
__entry->ino = ino;
- __entry->agno = pag->pag_agno;
+ __entry->agno = pag_agno(pag);
__entry->streams = atomic_read(&pag->pagf_fstrms);
),
TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d",
@@ -684,14 +726,14 @@ DECLARE_EVENT_CLASS(xfs_filestream_class,
)
#define DEFINE_FILESTREAM_EVENT(name) \
DEFINE_EVENT(xfs_filestream_class, name, \
- TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino), \
+ TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino), \
TP_ARGS(pag, ino))
DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
TRACE_EVENT(xfs_filestream_pick,
- TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino),
+ TP_PROTO(const struct xfs_perag *pag, xfs_ino_t ino),
TP_ARGS(pag, ino),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -701,9 +743,9 @@ TRACE_EVENT(xfs_filestream_pick,
__field(xfs_extlen_t, free)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
__entry->ino = ino;
- __entry->agno = pag->pag_agno;
+ __entry->agno = pag_agno(pag);
__entry->streams = atomic_read(&pag->pagf_fstrms);
__entry->free = pag->pagf_freeblks;
),
@@ -822,28 +864,32 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
-TRACE_EVENT(xfs_filemap_fault,
- TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault),
- TP_ARGS(ip, order, write_fault),
+DECLARE_EVENT_CLASS(xfs_fault_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int order),
+ TP_ARGS(ip, order),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(unsigned int, order)
- __field(bool, write_fault)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->order = order;
- __entry->write_fault = write_fault;
),
- TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d",
+ TP_printk("dev %d:%d ino 0x%llx order %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->order,
- __entry->write_fault)
+ __entry->order)
)
+#define DEFINE_FAULT_EVENT(name) \
+DEFINE_EVENT(xfs_fault_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int order), \
+ TP_ARGS(ip, order))
+DEFINE_FAULT_EVENT(xfs_read_fault);
+DEFINE_FAULT_EVENT(xfs_write_fault);
+
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
TP_ARGS(ip, caller_ip),
@@ -894,9 +940,10 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
)
TRACE_EVENT(xfs_irec_merge_pre,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
- uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
- TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_PROTO(const struct xfs_perag *pag,
+ const struct xfs_inobt_rec_incore *rec,
+ const struct xfs_inobt_rec_incore *nrec),
+ TP_ARGS(pag, rec, nrec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -906,12 +953,12 @@ TRACE_EVENT(xfs_irec_merge_pre,
__field(uint16_t, nholemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agino = agino;
- __entry->holemask = holemask;
- __entry->nagino = nagino;
- __entry->nholemask = holemask;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agino = rec->ir_startino;
+ __entry->holemask = rec->ir_holemask;
+ __entry->nagino = nrec->ir_startino;
+ __entry->nholemask = nrec->ir_holemask;
),
TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -923,9 +970,9 @@ TRACE_EVENT(xfs_irec_merge_pre,
)
TRACE_EVENT(xfs_irec_merge_post,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
- uint16_t holemask),
- TP_ARGS(mp, agno, agino, holemask),
+ TP_PROTO(const struct xfs_perag *pag,
+ const struct xfs_inobt_rec_incore *nrec),
+ TP_ARGS(pag, nrec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -933,10 +980,10 @@ TRACE_EVENT(xfs_irec_merge_post,
__field(uint16_t, holemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agino = agino;
- __entry->holemask = holemask;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
+ __entry->agino = nrec->ir_startino;
+ __entry->holemask = nrec->ir_holemask;
),
TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x",
MAJOR(__entry->dev),
@@ -1634,44 +1681,48 @@ TRACE_EVENT(xfs_bunmap,
);
DECLARE_EVENT_CLASS(xfs_extent_busy_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, agbno, len),
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
+ xfs_extlen_t len),
+ TP_ARGS(xg, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len)
);
#define DEFINE_BUSY_EVENT(name) \
DEFINE_EVENT(xfs_extent_busy_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(mp, agno, agbno, len))
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(xg, agbno, len))
DEFINE_BUSY_EVENT(xfs_extent_busy);
DEFINE_BUSY_EVENT(xfs_extent_busy_force);
DEFINE_BUSY_EVENT(xfs_extent_busy_reuse);
DEFINE_BUSY_EVENT(xfs_extent_busy_clear);
TRACE_EVENT(xfs_extent_busy_trim,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len,
- xfs_agblock_t tbno, xfs_extlen_t tlen),
- TP_ARGS(mp, agno, agbno, len, tbno, tlen),
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
+ xfs_extlen_t len, xfs_agblock_t tbno, xfs_extlen_t tlen),
+ TP_ARGS(xg, agbno, len, tbno, tlen),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
@@ -1679,22 +1730,99 @@ TRACE_EVENT(xfs_extent_busy_trim,
__field(xfs_extlen_t, tlen)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->tbno = tbno;
__entry->tlen = tlen;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x %sbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agbno,
__entry->len,
__entry->tbno,
__entry->tlen)
);
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xfs_rtalloc_extent_busy,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t start,
+ xfs_rtxlen_t minlen, xfs_rtxlen_t maxlen,
+ xfs_rtxlen_t len, xfs_rtxlen_t prod, xfs_rtxnum_t rtx,
+ unsigned busy_gen),
+ TP_ARGS(rtg, start, minlen, maxlen, len, prod, rtx, busy_gen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rtxnum_t, start)
+ __field(xfs_rtxlen_t, minlen)
+ __field(xfs_rtxlen_t, maxlen)
+ __field(xfs_rtxlen_t, prod)
+ __field(xfs_rtxlen_t, len)
+ __field(xfs_rtxnum_t, rtx)
+ __field(unsigned, busy_gen)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->start = start;
+ __entry->minlen = minlen;
+ __entry->maxlen = maxlen;
+ __entry->prod = prod;
+ __entry->len = len;
+ __entry->rtx = rtx;
+ __entry->busy_gen = busy_gen;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x startrtx 0x%llx minlen %u maxlen %u "
+ "prod %u len %u rtx 0%llx busy_gen 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->start,
+ __entry->minlen,
+ __entry->maxlen,
+ __entry->prod,
+ __entry->len,
+ __entry->rtx,
+ __entry->busy_gen)
+)
+
+TRACE_EVENT(xfs_rtalloc_extent_busy_trim,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rtxnum_t old_rtx,
+ xfs_rtxlen_t old_len, xfs_rtxnum_t new_rtx,
+ xfs_rtxlen_t new_len),
+ TP_ARGS(rtg, old_rtx, old_len, new_rtx, new_len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rtxnum_t, old_rtx)
+ __field(xfs_rtxnum_t, new_rtx)
+ __field(xfs_rtxlen_t, old_len)
+ __field(xfs_rtxlen_t, new_len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->old_rtx = old_rtx;
+ __entry->old_len = old_len;
+ __entry->new_rtx = new_rtx;
+ __entry->new_len = new_len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x rtx 0x%llx rtxcount 0x%x -> rtx 0x%llx rtxcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->old_rtx,
+ __entry->old_len,
+ __entry->new_rtx,
+ __entry->new_len)
+);
+#endif /* CONFIG_XFS_RT */
+
DECLARE_EVENT_CLASS(xfs_agf_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
unsigned long caller_ip),
@@ -1758,10 +1886,10 @@ DEFINE_AGF_EVENT(xfs_agf);
DEFINE_AGF_EVENT(xfs_agfl_reset);
TRACE_EVENT(xfs_free_extent,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ TP_PROTO(const struct xfs_perag *pag, xfs_agblock_t agbno,
xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft,
int haveright),
- TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright),
+ TP_ARGS(pag, agbno, len, resv, haveleft, haveright),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -1772,8 +1900,8 @@ TRACE_EVENT(xfs_free_extent,
__field(int, haveright)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->agbno = agbno;
__entry->len = len;
__entry->resv = resv;
@@ -2426,23 +2554,26 @@ DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
DECLARE_EVENT_CLASS(xfs_discard_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, agbno, len),
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
+ xfs_extlen_t len),
+ TP_ARGS(xg, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->agno = xg->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__entry->agbno,
__entry->len)
@@ -2450,9 +2581,9 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
#define DEFINE_DISCARD_EVENT(name) \
DEFINE_EVENT(xfs_discard_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(mp, agno, agbno, len))
+ TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(xg, agbno, len))
DEFINE_DISCARD_EVENT(xfs_discard_extent);
DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
@@ -2542,7 +2673,7 @@ TRACE_EVENT(xfs_btree_alloc_block,
__entry->ino = cur->bc_ino.ip->i_ino;
break;
case XFS_BTREE_TYPE_AG:
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->ino = 0;
break;
case XFS_BTREE_TYPE_MEM:
@@ -2712,6 +2843,7 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
TP_ARGS(mp, free),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
@@ -2719,13 +2851,16 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
- __entry->agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
- __entry->agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+ __entry->type = free->xefi_group->xg_type;
+ __entry->agno = free->xefi_group->xg_gno;
+ __entry->agbno = xfs_fsb_to_gbno(mp, free->xefi_startblock,
+ free->xefi_group->xg_type);
__entry->len = free->xefi_blockcount;
__entry->flags = free->xefi_flags;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x flags 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x gbno 0x%x fsbcount 0x%x flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
__entry->agbno,
__entry->len,
@@ -2735,7 +2870,6 @@ DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
TP_PROTO(struct xfs_mount *mp, struct xfs_extent_free_item *free), \
TP_ARGS(mp, free))
-DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_defer);
DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_extent_free_deferred);
@@ -2798,7 +2932,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = oinfo->oi_owner;
@@ -2843,7 +2977,7 @@ DECLARE_EVENT_CLASS(xfs_btree_error_class,
__entry->ino = cur->bc_ino.ip->i_ino;
break;
case XFS_BTREE_TYPE_AG:
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->ino = 0;
break;
case XFS_BTREE_TYPE_MEM:
@@ -2897,7 +3031,7 @@ TRACE_EVENT(xfs_rmap_convert_state,
__entry->ino = cur->bc_ino.ip->i_ino;
break;
case XFS_BTREE_TYPE_AG:
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->ino = 0;
break;
case XFS_BTREE_TYPE_MEM:
@@ -2932,7 +3066,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
__entry->owner = owner;
@@ -3032,11 +3166,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
TP_ARGS(bi),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(dev_t, opdev)
+ __field(enum xfs_group_type, type)
__field(xfs_agnumber_t, agno)
__field(xfs_ino_t, ino)
- __field(xfs_agblock_t, agbno)
- __field(xfs_fsblock_t, rtbno)
+ __field(unsigned long long, gbno)
__field(int, whichfork)
__field(xfs_fileoff_t, l_loff)
__field(xfs_filblks_t, l_len)
@@ -3045,20 +3178,25 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
),
TP_fast_assign(
struct xfs_inode *ip = bi->bi_owner;
+ struct xfs_mount *mp = ip->i_mount;
- __entry->dev = ip->i_mount->m_super->s_dev;
- if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) {
- __entry->agno = 0;
- __entry->agbno = 0;
- __entry->rtbno = bi->bi_bmap.br_startblock;
- __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev;
+ __entry->dev = mp->m_super->s_dev;
+ __entry->type = bi->bi_group->xg_type;
+ __entry->agno = bi->bi_group->xg_gno;
+ if (bi->bi_group->xg_type == XG_TYPE_RTG &&
+ !xfs_has_rtgroups(mp)) {
+ /*
+ * Legacy rt filesystems do not have allocation groups
+ * ondisk. We emulate this incore with one gigantic
+ * rtgroup whose size can exceed a 32-bit block number.
+ * For this tracepoint, we report group 0 and a 64-bit
+ * group block number.
+ */
+ __entry->gbno = bi->bi_bmap.br_startblock;
} else {
- __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount,
- bi->bi_bmap.br_startblock);
- __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount,
- bi->bi_bmap.br_startblock);
- __entry->rtbno = 0;
- __entry->opdev = __entry->dev;
+ __entry->gbno = xfs_fsb_to_gbno(mp,
+ bi->bi_bmap.br_startblock,
+ bi->bi_group->xg_type);
}
__entry->ino = ip->i_ino;
__entry->whichfork = bi->bi_whichfork;
@@ -3067,14 +3205,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
__entry->l_state = bi->bi_bmap.br_state;
__entry->op = bi->bi_type;
),
- TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+ TP_printk("dev %d:%d op %s ino 0x%llx %sno 0x%x gbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS),
- MAJOR(__entry->opdev), MINOR(__entry->opdev),
__entry->ino,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
__entry->agno,
- __entry->agbno,
- __entry->rtbno,
+ __entry->gbno,
__print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
__entry->l_loff,
__entry->l_len,
@@ -3105,8 +3242,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
TP_fast_assign(
struct xfs_ag_resv *r = xfs_perag_resv(pag, resv);
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->resv = resv;
__entry->freeblks = pag->pagf_freeblks;
__entry->flcount = pag->pagf_flcount;
@@ -3139,11 +3276,10 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical);
DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
-/* simple AG-based error/%ip tracepoint class */
-DECLARE_EVENT_CLASS(xfs_ag_error_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error,
+TRACE_EVENT(xfs_ag_resv_init_error,
+ TP_PROTO(const struct xfs_perag *pag, int error,
unsigned long caller_ip),
- TP_ARGS(mp, agno, error, caller_ip),
+ TP_ARGS(pag, error, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -3151,8 +3287,8 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class,
__field(unsigned long, caller_ip)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->error = error;
__entry->caller_ip = caller_ip;
),
@@ -3163,13 +3299,6 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class,
(char *)__entry->caller_ip)
);
-#define DEFINE_AG_ERROR_EVENT(name) \
-DEFINE_EVENT(xfs_ag_error_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int error, \
- unsigned long caller_ip), \
- TP_ARGS(mp, agno, error, caller_ip))
-DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
-
/* refcount tracepoint classes */
DECLARE_EVENT_CLASS(xfs_refcount_class,
@@ -3184,7 +3313,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = agbno;
__entry->len = len;
),
@@ -3215,7 +3344,7 @@ TRACE_EVENT(xfs_refcount_lookup,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = agbno;
__entry->dir = dir;
),
@@ -3241,7 +3370,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -3277,7 +3406,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->domain = irec->rc_domain;
__entry->startblock = irec->rc_startblock;
__entry->blockcount = irec->rc_blockcount;
@@ -3319,7 +3448,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3369,7 +3498,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3424,7 +3553,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->i1_domain = i1->rc_domain;
__entry->i1_startblock = i1->rc_startblock;
__entry->i1_blockcount = i1->rc_blockcount;
@@ -3838,7 +3967,45 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
/* fsmap traces */
-DECLARE_EVENT_CLASS(xfs_fsmap_class,
+TRACE_EVENT(xfs_fsmap_mapping,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+ const struct xfs_fsmap_irec *frec),
+ TP_ARGS(mp, keydev, agno, frec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_daddr_t, start_daddr)
+ __field(xfs_daddr_t, len_daddr)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->agno = agno;
+ __entry->agbno = frec->rec_key;
+ __entry->start_daddr = frec->start_daddr;
+ __entry->len_daddr = frec->len_daddr;
+ __entry->owner = frec->owner;
+ __entry->offset = frec->offset;
+ __entry->flags = frec->rm_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x rmapbno 0x%x start_daddr 0x%llx len_daddr 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->start_daddr,
+ __entry->len_daddr,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
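
Review note: the field set of the new struct xfs_fsmap_irec, as consumed by this tracepoint, appears to be roughly the following (reconstructed from the assignments above; the real struct may carry more members):

    struct example_fsmap_irec {
            xfs_daddr_t     start_daddr;    /* mapping start, in 512b units */
            xfs_daddr_t     len_daddr;      /* mapping length, in 512b units */
            xfs_agblock_t   rec_key;        /* group-relative key position */
            uint64_t        owner;          /* rmap owner code */
            uint64_t        offset;         /* file offset of the mapping */
            unsigned int    rm_flags;       /* rmap flags */
    };
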
+
+DECLARE_EVENT_CLASS(xfs_fsmap_group_key_class,
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
const struct xfs_rmap_irec *rmap),
TP_ARGS(mp, keydev, agno, rmap),
@@ -3846,8 +4013,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
__field(dev_t, dev)
__field(dev_t, keydev)
__field(xfs_agnumber_t, agno)
- __field(xfs_fsblock_t, bno)
- __field(xfs_filblks_t, len)
+ __field(xfs_agblock_t, agbno)
__field(uint64_t, owner)
__field(uint64_t, offset)
__field(unsigned int, flags)
@@ -3856,33 +4022,30 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
__entry->dev = mp->m_super->s_dev;
__entry->keydev = new_decode_dev(keydev);
__entry->agno = agno;
- __entry->bno = rmap->rm_startblock;
- __entry->len = rmap->rm_blockcount;
+ __entry->agbno = rmap->rm_startblock;
__entry->owner = rmap->rm_owner;
__entry->offset = rmap->rm_offset;
__entry->flags = rmap->rm_flags;
),
- TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x",
+ TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->agno,
- __entry->bno,
- __entry->len,
+ __entry->agbno,
__entry->owner,
__entry->offset,
__entry->flags)
)
-#define DEFINE_FSMAP_EVENT(name) \
-DEFINE_EVENT(xfs_fsmap_class, name, \
+#define DEFINE_FSMAP_GROUP_KEY_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_group_key_class, name, \
TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
const struct xfs_rmap_irec *rmap), \
TP_ARGS(mp, keydev, agno, rmap))
-DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
-DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
-DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_low_group_key);
+DEFINE_FSMAP_GROUP_KEY_EVENT(xfs_fsmap_high_group_key);
-DECLARE_EVENT_CLASS(xfs_fsmap_linear_class,
- TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno),
+DECLARE_EVENT_CLASS(xfs_fsmap_linear_key_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_fsblock_t bno),
TP_ARGS(mp, keydev, bno),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -3899,12 +4062,12 @@ DECLARE_EVENT_CLASS(xfs_fsmap_linear_class,
MAJOR(__entry->keydev), MINOR(__entry->keydev),
__entry->bno)
)
-#define DEFINE_FSMAP_LINEAR_EVENT(name) \
-DEFINE_EVENT(xfs_fsmap_linear_class, name, \
+#define DEFINE_FSMAP_LINEAR_KEY_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_linear_key_class, name, \
TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \
TP_ARGS(mp, keydev, bno))
-DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear);
-DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear);
+DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_low_linear_key);
+DEFINE_FSMAP_LINEAR_KEY_EVENT(xfs_fsmap_high_linear_key);
DECLARE_EVENT_CLASS(xfs_getfsmap_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
@@ -4036,9 +4199,9 @@ DEFINE_TRANS_EVENT(xfs_trans_commit_items);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
TRACE_EVENT(xfs_iunlink_update_bucket,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ TP_PROTO(const struct xfs_perag *pag, unsigned int bucket,
xfs_agino_t old_ptr, xfs_agino_t new_ptr),
- TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_ARGS(pag, bucket, old_ptr, new_ptr),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -4047,8 +4210,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
__field(xfs_agino_t, new_ptr)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->bucket = bucket;
__entry->old_ptr = old_ptr;
__entry->new_ptr = new_ptr;
@@ -4062,9 +4225,8 @@ TRACE_EVENT(xfs_iunlink_update_bucket,
);
TRACE_EVENT(xfs_iunlink_update_dinode,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
- xfs_agino_t old_ptr, xfs_agino_t new_ptr),
- TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_PROTO(const struct xfs_iunlink_item *iup, xfs_agino_t old_ptr),
+ TP_ARGS(iup, old_ptr),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -4073,11 +4235,12 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__field(xfs_agino_t, new_ptr)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agino = agino;
+ __entry->dev = pag_mount(iup->pag)->m_super->s_dev;
+ __entry->agno = pag_agno(iup->pag);
+ __entry->agino =
+ XFS_INO_TO_AGINO(iup->ip->i_mount, iup->ip->i_ino);
__entry->old_ptr = old_ptr;
- __entry->new_ptr = new_ptr;
+ __entry->new_ptr = iup->next_agino;
),
TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -4180,37 +4343,35 @@ DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
-DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
-DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt);
-DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
-DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
-DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
- TP_ARGS(mp, agno, flags),
+DECLARE_EVENT_CLASS(xfs_group_corrupt_class,
+ TP_PROTO(const struct xfs_group *xg, unsigned int flags),
+ TP_ARGS(xg, flags),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
+ __field(enum xfs_group_type, type)
+ __field(uint32_t, index)
__field(unsigned int, flags)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->index = xg->xg_gno;
__entry->flags = flags;
),
- TP_printk("dev %d:%d agno 0x%x flags 0x%x",
+ TP_printk("dev %d:%d %sno 0x%x flags 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno, __entry->flags)
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->index, __entry->flags)
);
-#define DEFINE_AG_CORRUPT_EVENT(name) \
-DEFINE_EVENT(xfs_ag_corrupt_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- unsigned int flags), \
- TP_ARGS(mp, agno, flags))
-DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
-DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt);
-DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
-DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
+#define DEFINE_GROUP_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_group_corrupt_class, name, \
+ TP_PROTO(const struct xfs_group *xg, unsigned int flags), \
+ TP_ARGS(xg, flags))
+DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_sick);
+DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_corrupt);
+DEFINE_GROUP_CORRUPT_EVENT(xfs_group_mark_healthy);
+DEFINE_GROUP_CORRUPT_EVENT(xfs_group_unfixed_corruption);
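
Review note: several events in this patch print the group number as "%sno", with __print_symbolic() supplying the prefix. The mapping presumably looks like this (an assumption based on the format strings, not quoted from the patch):

    #define XG_TYPE_STRINGS \
            { XG_TYPE_AG,   "ag" }, \
            { XG_TYPE_RTG,  "rtg" }

so the same event renders as either "agno 0x..." or "rtgno 0x..." depending on the group type.
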
DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
TP_PROTO(struct xfs_inode *ip, unsigned int flags),
@@ -4238,29 +4399,10 @@ DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt);
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption);
-TRACE_EVENT(xfs_iwalk_ag,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino),
- TP_ARGS(mp, agno, startino),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agino_t, startino)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->startino = startino;
- ),
- TP_printk("dev %d:%d agno 0x%x startino 0x%x",
- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
- __entry->startino)
-)
-
TRACE_EVENT(xfs_iwalk_ag_rec,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ TP_PROTO(const struct xfs_perag *pag,
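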
struct xfs_inobt_rec_incore *irec),
- TP_ARGS(mp, agno, irec),
+ TP_ARGS(pag, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -4268,8 +4410,8 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
__field(uint64_t, freemask)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag_mount(pag)->m_super->s_dev;
+ __entry->agno = pag_agno(pag);
__entry->startino = irec->ir_startino;
__entry->freemask = irec->ir_free;
),
@@ -4331,7 +4473,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
__assign_str(name);
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = cur->bc_ag.afake->af_root;
__entry->levels = cur->bc_ag.afake->af_levels;
__entry->blocks = cur->bc_ag.afake->af_blocks;
@@ -4446,7 +4588,7 @@ TRACE_EVENT(xfs_btree_bload_block,
__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
__entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
} else {
- __entry->agno = cur->bc_ag.pag->pag_agno;
+ __entry->agno = cur->bc_group->xg_gno;
__entry->agbno = be32_to_cpu(ptr->s);
}
__entry->nr_records = nr_records;
@@ -4671,35 +4813,39 @@ TRACE_EVENT(xfs_force_shutdown,
);
#ifdef CONFIG_XFS_DRAIN_INTENTS
-DECLARE_EVENT_CLASS(xfs_perag_intents_class,
- TP_PROTO(struct xfs_perag *pag, void *caller_ip),
- TP_ARGS(pag, caller_ip),
+DECLARE_EVENT_CLASS(xfs_group_intents_class,
+ TP_PROTO(const struct xfs_group *xg, void *caller_ip),
+ TP_ARGS(xg, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
+ __field(enum xfs_group_type, type)
+ __field(uint32_t, index)
__field(long, nr_intents)
__field(void *, caller_ip)
),
TP_fast_assign(
- __entry->dev = pag->pag_mount->m_super->s_dev;
- __entry->agno = pag->pag_agno;
- __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count);
+ __entry->dev = xg->xg_mount->m_super->s_dev;
+ __entry->type = xg->xg_type;
+ __entry->index = xg->xg_gno;
+ __entry->nr_intents =
+ atomic_read(&xg->xg_intents_drain.dr_count);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS",
+ TP_printk("dev %d:%d %sno 0x%x intents %ld caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno,
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->index,
__entry->nr_intents,
__entry->caller_ip)
);
-#define DEFINE_PERAG_INTENTS_EVENT(name) \
-DEFINE_EVENT(xfs_perag_intents_class, name, \
- TP_PROTO(struct xfs_perag *pag, void *caller_ip), \
- TP_ARGS(pag, caller_ip))
-DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold);
-DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele);
-DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+#define DEFINE_GROUP_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_group_intents_class, name, \
+ TP_PROTO(const struct xfs_group *xg, void *caller_ip), \
+ TP_ARGS(xg, caller_ip))
+DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_hold);
+DEFINE_GROUP_INTENTS_EVENT(xfs_group_intent_rele);
+DEFINE_GROUP_INTENTS_EVENT(xfs_group_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
@@ -5327,6 +5473,107 @@ DEFINE_EVENT(xfs_getparents_class, name, \
DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_begin);
DEFINE_XFS_GETPARENTS_EVENT(xfs_getparents_end);
+DECLARE_EVENT_CLASS(xfs_metadir_update_class,
+ TP_PROTO(const struct xfs_metadir_update *upd),
+ TP_ARGS(upd),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __field(xfs_ino_t, ino)
+ __string(fname, upd->path)
+ ),
+ TP_fast_assign(
+ __entry->dev = upd->dp->i_mount->m_super->s_dev;
+ __entry->dp_ino = upd->dp->i_ino;
+ __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
+ __assign_str(fname);
+ ),
+ TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __get_str(fname),
+ __entry->ino)
+)
+
+#define DEFINE_METADIR_UPDATE_EVENT(name) \
+DEFINE_EVENT(xfs_metadir_update_class, name, \
+ TP_PROTO(const struct xfs_metadir_update *upd), \
+ TP_ARGS(upd))
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_create);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_start_link);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_commit);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_cancel);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_try_create);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_create);
+DEFINE_METADIR_UPDATE_EVENT(xfs_metadir_link);
+
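Review note: the fields these tracepoints consume suggest the following minimal shape for struct xfs_metadir_update (reconstructed from the assignments; the real struct certainly carries more transaction state):

    struct example_metadir_update {
            struct xfs_inode        *dp;    /* parent metadata directory */
            struct xfs_inode        *ip;    /* inode being created/linked */
            const char              *path;  /* path name, for tracing */
    };
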
+DECLARE_EVENT_CLASS(xfs_metadir_update_error_class,
+ TP_PROTO(const struct xfs_metadir_update *upd, int error),
+ TP_ARGS(upd, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __field(xfs_ino_t, ino)
+ __field(int, error)
+ __string(fname, upd->path)
+ ),
+ TP_fast_assign(
+ __entry->dev = upd->dp->i_mount->m_super->s_dev;
+ __entry->dp_ino = upd->dp->i_ino;
+ __entry->ino = upd->ip ? upd->ip->i_ino : NULLFSINO;
+ __entry->error = error;
+ __assign_str(fname);
+ ),
+ TP_printk("dev %d:%d dp 0x%llx fname '%s' ino 0x%llx error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __get_str(fname),
+ __entry->ino,
+ __entry->error)
+)
+
+#define DEFINE_METADIR_UPDATE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_metadir_update_error_class, name, \
+ TP_PROTO(const struct xfs_metadir_update *upd, int error), \
+ TP_ARGS(upd, error))
+DEFINE_METADIR_UPDATE_ERROR_EVENT(xfs_metadir_teardown);
+
+DECLARE_EVENT_CLASS(xfs_metadir_class,
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name,
+ xfs_ino_t ino),
+ TP_ARGS(dp, name, ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, dp_ino)
+ __field(xfs_ino_t, ino)
+ __field(int, ftype)
+ __field(int, namelen)
+ __dynamic_array(char, name, name->len)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(dp)->i_sb->s_dev;
+ __entry->dp_ino = dp->i_ino;
+ __entry->ino = ino;
+ __entry->ftype = name->type;
+ __entry->namelen = name->len;
+ memcpy(__get_str(name), name->name, name->len);
+ ),
+ TP_printk("dev %d:%d dir 0x%llx type %s name '%.*s' ino 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dp_ino,
+ __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+ __entry->namelen,
+ __get_str(name),
+ __entry->ino)
+)
+
+#define DEFINE_METADIR_EVENT(name) \
+DEFINE_EVENT(xfs_metadir_class, name, \
+ TP_PROTO(struct xfs_inode *dp, struct xfs_name *name, \
+ xfs_ino_t ino), \
+ TP_ARGS(dp, name, ino))
+DEFINE_METADIR_EVENT(xfs_metadir_lookup);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index bdf3704dc301..30fbed27cf05 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -25,6 +25,8 @@
#include "xfs_dquot.h"
#include "xfs_icache.h"
#include "xfs_rtbitmap.h"
+#include "xfs_rtgroup.h"
+#include "xfs_sb.h"
struct kmem_cache *xfs_trans_cache;
@@ -67,7 +69,7 @@ xfs_trans_free(
struct xfs_trans *tp)
{
xfs_extent_busy_sort(&tp->t_busy);
- xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+ xfs_extent_busy_clear(&tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
xfs_trans_clear_context(tp);
@@ -420,6 +422,8 @@ xfs_trans_mod_sb(
ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
}
tp->t_frextents_delta += delta;
+ if (xfs_has_rtgroups(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_RES_FREXTENTS:
/*
@@ -429,6 +433,8 @@ xfs_trans_mod_sb(
*/
ASSERT(delta < 0);
tp->t_res_frextents_delta += delta;
+ if (xfs_has_rtgroups(mp))
+ flags &= ~XFS_TRANS_SB_DIRTY;
break;
case XFS_TRANS_SB_DBLOCKS:
tp->t_dblocks_delta += delta;
@@ -455,6 +461,10 @@ xfs_trans_mod_sb(
case XFS_TRANS_SB_REXTSLOG:
tp->t_rextslog_delta += delta;
break;
+ case XFS_TRANS_SB_RGCOUNT:
+ ASSERT(delta > 0);
+ tp->t_rgcount_delta += delta;
+ break;
default:
ASSERT(0);
return;
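
Review note: clearing XFS_TRANS_SB_DIRTY here means that on rtgroups filesystems a free-rt-extent delta no longer forces the ondisk superblock to be logged; the counter is applied incore at unreserve time instead. In caller terms, using the xfs_trans_mod_sb() signature shown above (rtxlen is a placeholder for the number of rt extents freed):

    /*
     * On an rtgroups filesystem this updates the incore lazy counter
     * only; pre-rtgroups filesystems still log the superblock.
     */
    xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -(int64_t)rtxlen);
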
@@ -497,20 +507,22 @@ xfs_trans_apply_sb_deltas(
}
/*
- * Updating frextents requires careful handling because it does not
- * behave like the lazysb counters because we cannot rely on log
- * recovery in older kenels to recompute the value from the rtbitmap.
- * This means that the ondisk frextents must be consistent with the
- * rtbitmap.
+ * sb_frextents was added to the lazy sb counters when the rt groups
+ * feature was introduced. This is possible because we know that all
+ * kernels supporting rtgroups will also recompute frextents from the
+ * realtime bitmap.
+ *
+ * For older file systems, updating frextents requires careful handling
+ * because we cannot rely on log recovery in older kernels to recompute
+ * the value from the rtbitmap. This means that the ondisk frextents
+ * must be consistent with the rtbitmap.
*
* Therefore, log the frextents change to the ondisk superblock and
* update the incore superblock so that future calls to xfs_log_sb
* write the correct value ondisk.
- *
- * Don't touch m_frextents because it includes incore reservations,
- * and those are handled by the unreserve function.
*/
- if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
+ !xfs_has_rtgroups(tp->t_mountp)) {
struct xfs_mount *mp = tp->t_mountp;
int64_t rtxdelta;
@@ -536,6 +548,18 @@ xfs_trans_apply_sb_deltas(
}
if (tp->t_rextsize_delta) {
be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
+
+ /*
+ * Because the ondisk sb records rtgroup size in units of rt
+ * extents, any time we update the rt extent size we have to
+ * recompute the ondisk rtgroup block log. The incore values
+ * will be recomputed in xfs_trans_unreserve_and_mod_sb.
+ */
+ if (xfs_has_rtgroups(tp->t_mountp)) {
+ sbp->sb_rgblklog = xfs_compute_rgblklog(
+ be32_to_cpu(sbp->sb_rgextents),
+ be32_to_cpu(sbp->sb_rextsize));
+ }
whole = 1;
}
if (tp->t_rbmblocks_delta) {
@@ -554,6 +578,10 @@ xfs_trans_apply_sb_deltas(
sbp->sb_rextslog += tp->t_rextslog_delta;
whole = 1;
}
+ if (tp->t_rgcount_delta) {
+ be32_add_cpu(&sbp->sb_rgcount, tp->t_rgcount_delta);
+ whole = 1;
+ }
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
if (whole)
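
Review note: because sb_rgextents stays fixed while sb_rextsize can change, the rtgroup block log has to be rederived whenever the extent size moves. A hedged sketch of such a computation; xfs_compute_rgblklog's actual body may differ:

    /*
     * Illustrative only: an rtgroup spans rgextents rt extents of
     * rextsize blocks each; its block log is ceil(log2()) of that
     * product.
     */
    static inline uint8_t
    example_compute_rgblklog(uint32_t rgextents, uint32_t rextsize)
    {
            uint64_t        rgblocks = (uint64_t)rgextents * rextsize;

            return xfs_highbit64(rgblocks) +
                    (is_power_of_2(rgblocks) ? 0 : 1);
    }
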
@@ -618,7 +646,7 @@ xfs_trans_unreserve_and_mod_sb(
}
ASSERT(tp->t_rtx_res || tp->t_frextents_delta >= 0);
- if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+ if (xfs_has_rtgroups(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
rtxdelta += tp->t_frextents_delta;
ASSERT(rtxdelta >= 0);
}
@@ -651,23 +679,21 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
/*
- * Do not touch sb_frextents here because we are dealing with incore
- * reservation. sb_frextents is not part of the lazy sb counters so it
- * must be consistent with the ondisk rtbitmap and must never include
- * incore reservations.
+ * Do not touch sb_frextents here because it is handled in
+ * xfs_trans_apply_sb_deltas for file systems where it isn't a lazy
+ * counter anyway.
*/
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
- mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
- if (tp->t_rextsize_delta) {
- mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize);
- mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize);
- }
+ if (tp->t_rextsize_delta)
+ xfs_mount_sb_set_rextsize(mp, &mp->m_sb,
+ mp->m_sb.sb_rextsize + tp->t_rextsize_delta);
mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
mp->m_sb.sb_rextents += tp->t_rextents_delta;
mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
+ mp->m_sb.sb_rgcount += tp->t_rgcount_delta;
spin_unlock(&mp->m_sb_lock);
/*
@@ -1262,11 +1288,26 @@ retry:
gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
if (udqp || gdqp || pdqp) {
+ xfs_filblks_t dblocks, rblocks;
unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
+ bool isrt = XFS_IS_REALTIME_INODE(ip);
if (force)
qflags |= XFS_QMOPT_FORCE_RES;
+ if (isrt) {
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_cancel;
+ }
+
+ xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
+
+ if (isrt)
+ rblocks += ip->i_delayed_blks;
+ else
+ dblocks += ip->i_delayed_blks;
+
/*
* Reserve enough quota to handle blocks on disk and reserved
* for a delayed allocation. We'll actually transfer the
@@ -1274,8 +1315,20 @@ retry:
* though that part is only semi-transactional.
*/
error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
- pdqp, ip->i_nblocks + ip->i_delayed_blks,
- 1, qflags);
+ pdqp, dblocks, 1, qflags);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+
+ /* Do the same for realtime. */
+ qflags = XFS_QMOPT_RES_RTBLKS | (qflags & XFS_QMOPT_FORCE_RES);
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+ pdqp, rblocks, 0, qflags);
if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
xfs_trans_cancel(tp);
xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
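
Review note: the reservation is now split across the regular and realtime quota pools, with the same blockgc-and-retry fallback applied to each step. Stripped of the transaction setup, the pattern above is roughly:

    /* Reserve data blocks plus the inode itself first... */
    error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
                    dblocks, 1, XFS_QMOPT_RES_REGBLKS);
    if (error)
            goto out_cancel;

    /* ...then rt blocks, with the inode already counted. */
    error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
                    rblocks, 0, XFS_QMOPT_RES_RTBLKS);
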
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f06cc0f41665..71c2e82e4dad 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -148,6 +148,7 @@ typedef struct xfs_trans {
int64_t t_rblocks_delta;/* superblock rblocks change */
int64_t t_rextents_delta;/* superblocks rextents chg */
int64_t t_rextslog_delta;/* superblocks rextslog chg */
+ int64_t t_rgcount_delta; /* realtime group count */
struct list_head t_items; /* log item descriptors */
struct list_head t_busy; /* list of busy extents */
struct list_head t_dfops; /* deferred operations */
@@ -214,6 +215,7 @@ xfs_trans_read_buf(
}
struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
+struct xfs_buf *xfs_trans_getrtsb(struct xfs_trans *tp);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index e28ab74af4f0..8e886ecfd69a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -168,12 +168,11 @@ xfs_trans_get_buf_map(
/*
* Get and lock the superblock buffer for the given transaction.
*/
-struct xfs_buf *
-xfs_trans_getsb(
- struct xfs_trans *tp)
+static struct xfs_buf *
+__xfs_trans_getsb(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
-
/*
* Just increment the lock recursion count if the buffer is already
* attached to this transaction.
@@ -197,6 +196,22 @@ xfs_trans_getsb(
return bp;
}
+struct xfs_buf *
+xfs_trans_getsb(
+ struct xfs_trans *tp)
+{
+ return __xfs_trans_getsb(tp, tp->t_mountp->m_sb_bp);
+}
+
+struct xfs_buf *
+xfs_trans_getrtsb(
+ struct xfs_trans *tp)
+{
+ if (!tp->t_mountp->m_rtsb_bp)
+ return NULL;
+ return __xfs_trans_getsb(tp, tp->t_mountp->m_rtsb_bp);
+}
+
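Review note: unlike xfs_trans_getsb(), the new xfs_trans_getrtsb() can return NULL when no incore rt superblock buffer exists, so callers need a guard along these lines (hypothetical caller; the log range is illustrative):

    struct xfs_buf  *rtsb_bp = xfs_trans_getrtsb(tp);

    if (rtsb_bp)    /* only log the rt super if this fs has one */
            xfs_trans_log_buf(tp, rtsb_bp, 0, sizeof(struct xfs_dsb) - 1);
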
/*
* Get and lock the buffer for the caller if it is not already
* locked within the given transaction. If it has not yet been
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index b368e13424c4..481ba3dc9f19 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -156,6 +156,8 @@ xfs_trans_mod_ino_dquot(
unsigned int field,
int64_t delta)
{
+ ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip));
+
xfs_trans_mod_dquot(tp, dqp, field, delta);
if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
@@ -247,6 +249,8 @@ xfs_trans_mod_dquot_byino(
xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
return;
+ ASSERT(!xfs_is_metadir_inode(ip) || XFS_IS_DQDETACHED(ip));
+
if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
@@ -962,6 +966,8 @@ xfs_trans_reserve_quota_nblks(
if (!XFS_IS_QUOTA_ON(mp))
return 0;
+ if (xfs_is_metadir_inode(ip))
+ return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
@@ -1025,3 +1031,14 @@ xfs_trans_free_dqinfo(
kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
+
+int
+xfs_quota_reserve_blkres(
+ struct xfs_inode *ip,
+ int64_t blocks)
+{
+ if (XFS_IS_REALTIME_INODE(ip))
+ return xfs_trans_reserve_quota_nblks(NULL, ip, 0, blocks,
+ false);
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
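
Review note: the new xfs_quota_reserve_blkres() gives callers a one-liner that charges an incore-only (transactionless) block reservation to whichever quota pool matches the inode. Hypothetical usage, with alloc_blocks as a placeholder count:

    /* Charge a delalloc reservation before creating the extent. */
    error = xfs_quota_reserve_blkres(ip, (int64_t)alloc_blocks);
    if (error)
            return error;
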
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index eaf849260bd6..0f641a9091ec 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -51,8 +51,7 @@ xfs_attr_grab_log_assist(
return error;
xfs_set_using_logged_xattrs(mp);
- xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP,
- "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!");
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LARP);
return 0;
}