aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_exchrange.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_exchrange.c')
-rw-r--r--fs/xfs/xfs_exchrange.c334
1 files changed, 333 insertions, 1 deletions
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index 35351b973521..0fc95e6471cb 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -12,8 +12,15 @@
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
+#include "xfs_quota.h"
+#include "xfs_bmap_util.h"
+#include "xfs_reflink.h"
+#include "xfs_trace.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
+#include "xfs_sb.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
#include <linux/fsnotify.h>
/* Lock (and optionally join) two inodes for a file range exchange. */
@@ -64,6 +71,207 @@ xfs_exchrange_estimate(
return error;
}
+#define QRETRY_IP1 (0x1)
+#define QRETRY_IP2 (0x2)
+
+/*
+ * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
+ * this if quota enforcement is disabled or if both inodes' dquots are the
+ * same. The qretry structure must be initialized to zeroes before the first
+ * call to this function.
+ */
+STATIC int
+xfs_exchrange_reserve_quota(
+ struct xfs_trans *tp,
+ const struct xfs_exchmaps_req *req,
+ unsigned int *qretry)
+{
+ int64_t ddelta, rdelta;
+ int ip1_error = 0;
+ int error;
+
+ /*
+ * Don't bother with a quota reservation if we're not enforcing them
+ * or the two inodes have the same dquots.
+ */
+ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
+ (req->ip1->i_udquot == req->ip2->i_udquot &&
+ req->ip1->i_gdquot == req->ip2->i_gdquot &&
+ req->ip1->i_pdquot == req->ip2->i_pdquot))
+ return 0;
+
+ *qretry = 0;
+
+ /*
+ * For each file, compute the net gain in the number of regular blocks
+ * that will be mapped into that file and reserve that much quota. The
+ * quota counts must be able to absorb at least that much space.
+ */
+ ddelta = req->ip2_bcount - req->ip1_bcount;
+ rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
+ if (ddelta > 0 || rdelta > 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
+ ddelta > 0 ? ddelta : 0,
+ rdelta > 0 ? rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ /*
+ * Save this error and see what happens if we try to
+ * reserve quota for ip2. Then report both.
+ */
+ *qretry |= QRETRY_IP1;
+ ip1_error = error;
+ error = 0;
+ }
+ if (error)
+ return error;
+ }
+ if (ddelta < 0 || rdelta < 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
+ ddelta < 0 ? -ddelta : 0,
+ rdelta < 0 ? -rdelta : 0,
+ false);
+ if (error == -EDQUOT || error == -ENOSPC)
+ *qretry |= QRETRY_IP2;
+ if (error)
+ return error;
+ }
+ if (ip1_error)
+ return ip1_error;
+
+ /*
+ * For each file, forcibly reserve the gross gain in mapped blocks so
+ * that we don't trip over any quota block reservation assertions.
+ * We must reserve the gross gain because the quota code subtracts from
+ * bcount the number of blocks that we unmap; it does not add that
+ * quantity back to the quota block reservation.
+ */
+ error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
+ req->ip1_rtbcount, true);
+ if (error)
+ return error;
+
+ return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
+ req->ip2_rtbcount, true);
+}
+
+/* Exchange the mappings (and hence the contents) of two files' forks. */
+STATIC int
+xfs_exchrange_mappings(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ struct xfs_mount *mp = ip1->i_mount;
+ struct xfs_exchmaps_req req = {
+ .ip1 = ip1,
+ .ip2 = ip2,
+ .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
+ .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
+ .blockcount = XFS_B_TO_FSB(mp, fxr->length),
+ };
+ struct xfs_trans *tp;
+ unsigned int qretry;
+ bool retried = false;
+ int error;
+
+ trace_xfs_exchrange_mappings(fxr, ip1, ip2);
+
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ req.flags |= XFS_EXCHMAPS_SET_SIZES;
+ if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
+ req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
+
+ error = xfs_exchrange_estimate(&req);
+ if (error)
+ return error;
+
+retry:
+ /* Allocate the transaction, lock the inodes, and join them. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
+ XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+
+ xfs_exchrange_ilock(tp, ip1, ip2);
+
+ trace_xfs_exchrange_before(ip2, 2);
+ trace_xfs_exchrange_before(ip1, 1);
+
+ error = xfs_exchmaps_check_forks(mp, &req);
+ if (error)
+ goto out_trans_cancel;
+
+ /*
+ * Reserve ourselves some quota if any of them are in enforcing mode.
+ * In theory we only need enough to satisfy the change in the number
+ * of blocks between the two ranges being remapped.
+ */
+ error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_exchrange_iunlock(ip1, ip2);
+ if (qretry & QRETRY_IP1)
+ xfs_blockgc_free_quota(ip1, 0);
+ if (qretry & QRETRY_IP2)
+ xfs_blockgc_free_quota(ip2, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_trans_cancel;
+
+ /* If we got this far on a dry run, all parameters are ok. */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
+ goto out_trans_cancel;
+
+ /* Update the mtime and ctime of both files. */
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
+ xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+ xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+ xfs_exchange_mappings(tp, &req);
+
+ /*
+ * Force the log to persist metadata updates if the caller or the
+ * administrator requires this. The generic prep function already
+ * flushed the relevant parts of the page cache.
+ */
+ if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_trans_commit(tp);
+
+ trace_xfs_exchrange_after(ip2, 2);
+ trace_xfs_exchrange_after(ip1, 1);
+
+ if (error)
+ goto out_unlock;
+
+ /*
+ * If the caller wanted us to exchange the contents of two complete
+ * files of unequal length, exchange the incore sizes now. This should
+ * be safe because we flushed both files' page caches, exchanged all
+ * the mappings, and updated the ondisk sizes.
+ */
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ loff_t temp;
+
+ temp = i_size_read(VFS_I(ip2));
+ i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
+ i_size_write(VFS_I(ip1), temp);
+ }
+
+out_unlock:
+ xfs_exchrange_iunlock(ip1, ip2);
+ return error;
+
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+}
+
/*
* Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
* This part deals with struct file objects and byte ranges and does not deal
@@ -287,6 +495,130 @@ xfs_exchange_range_finish(
return file_remove_privs(fxr->file2);
}
+/* Prepare two files to have their data exchanged. */
+STATIC int
+xfs_exchrange_prep(
+ struct xfs_exchrange *fxr,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
+ int error;
+
+ trace_xfs_exchrange_prep(fxr, ip1, ip2);
+
+ /* Verify both files are either real-time or non-realtime */
+ if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
+ return -EINVAL;
+
+ /*
+ * The alignment checks in the generic helpers cannot deal with
+ * allocation units that are not powers of 2. This can happen with the
+ * realtime volume if the extent size is set.
+ */
+ if (!is_power_of_2(alloc_unit))
+ return -EOPNOTSUPP;
+
+ error = xfs_exchange_range_prep(fxr, alloc_unit);
+ if (error || fxr->length == 0)
+ return error;
+
+ /* Attach dquots to both inodes before changing block maps. */
+ error = xfs_qm_dqattach(ip2);
+ if (error)
+ return error;
+ error = xfs_qm_dqattach(ip1);
+ if (error)
+ return error;
+
+ trace_xfs_exchrange_flush(fxr, ip1, ip2);
+
+ /* Flush the relevant ranges of both files. */
+ error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
+ if (error)
+ return error;
+ error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
+ if (error)
+ return error;
+
+ /*
+ * Cancel CoW fork preallocations for the ranges of both files. The
+ * prep function should have flushed all the dirty data, so the only
+ * CoW mappings remaining should be speculative.
+ */
+ if (xfs_inode_has_cow_data(ip1)) {
+ error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ if (xfs_inode_has_cow_data(ip2)) {
+ error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
+ fxr->length, true);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Exchange contents of files. This is the binding between the generic
+ * file-level concepts and the XFS inode-specific implementation.
+ */
+STATIC int
+xfs_exchrange_contents(
+ struct xfs_exchrange *fxr)
+{
+ struct inode *inode1 = file_inode(fxr->file1);
+ struct inode *inode2 = file_inode(fxr->file2);
+ struct xfs_inode *ip1 = XFS_I(inode1);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ struct xfs_mount *mp = ip1->i_mount;
+ int error;
+
+ if (!xfs_has_exchange_range(mp))
+ return -EOPNOTSUPP;
+
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ XFS_EXCHANGE_RANGE_PRIV_FLAGS))
+ return -EINVAL;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ /* Lock both files against IO */
+ error = xfs_ilock2_io_mmap(ip1, ip2);
+ if (error)
+ goto out_err;
+
+ /* Prepare and then exchange file contents. */
+ error = xfs_exchrange_prep(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_exchrange_mappings(fxr, ip1, ip2);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Finish the exchange by removing special file privileges like any
+ * other file write would do. This may involve turning on support for
+ * logged xattrs if either file has security capabilities.
+ */
+ error = xfs_exchange_range_finish(fxr);
+ if (error)
+ goto out_unlock;
+
+out_unlock:
+ xfs_iunlock2_io_mmap(ip1, ip2);
+out_err:
+ if (error)
+ trace_xfs_exchrange_error(ip2, error, _RET_IP_);
+ return error;
+}
+
/* Exchange parts of two files. */
static int
xfs_exchange_range(
@@ -341,7 +673,7 @@ xfs_exchange_range(
fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
file_start_write(fxr->file2);
- ret = -EOPNOTSUPP; /* XXX call out to lower level code */
+ ret = xfs_exchrange_contents(fxr);
file_end_write(fxr->file2);
if (ret)
return ret;