aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/cifs/Kconfig66
-rw-r--r--fs/cifs/cached_dir.c43
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_spnego.h2
-rw-r--r--fs/cifs/cifsacl.c34
-rw-r--r--fs/cifs/cifsencrypt.c172
-rw-r--r--fs/cifs/cifsfs.c12
-rw-r--r--fs/cifs/cifsfs.h8
-rw-r--r--fs/cifs/cifsglob.h81
-rw-r--r--fs/cifs/cifspdu.h98
-rw-r--r--fs/cifs/cifsproto.h11
-rw-r--r--fs/cifs/cifssmb.c48
-rw-r--r--fs/cifs/connect.c179
-rw-r--r--fs/cifs/dir.c19
-rw-r--r--fs/cifs/file.c1810
-rw-r--r--fs/cifs/fscache.c20
-rw-r--r--fs/cifs/fscache.h10
-rw-r--r--fs/cifs/inode.c53
-rw-r--r--fs/cifs/link.c66
-rw-r--r--fs/cifs/misc.c127
-rw-r--r--fs/cifs/ntlmssp.h4
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/sess.c1
-rw-r--r--fs/cifs/smb1ops.c72
-rw-r--r--fs/cifs/smb2file.c2
-rw-r--r--fs/cifs/smb2inode.c17
-rw-r--r--fs/cifs/smb2misc.c2
-rw-r--r--fs/cifs/smb2ops.c606
-rw-r--r--fs/cifs/smb2pdu.c291
-rw-r--r--fs/cifs/smb2pdu.h4
-rw-r--r--fs/cifs/smbdirect.c539
-rw-r--r--fs/cifs/smbdirect.h7
-rw-r--r--fs/cifs/transport.c87
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/ecryptfs/crypto.c30
-rw-r--r--fs/gfs2/aops.c9
-rw-r--r--fs/gfs2/aops.h4
-rw-r--r--fs/gfs2/bmap.c38
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/glock.c128
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/glops.c21
-rw-r--r--fs/gfs2/incore.h11
-rw-r--r--fs/gfs2/inode.c8
-rw-r--r--fs/gfs2/ops_fstype.c71
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/super.c49
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/iomap/buffered-io.c91
-rw-r--r--fs/ksmbd/Kconfig8
-rw-r--r--fs/ksmbd/asn1.c23
-rw-r--r--fs/ksmbd/connection.c6
-rw-r--r--fs/ksmbd/ksmbd_work.h2
-rw-r--r--fs/ksmbd/mgmt/user_session.c98
-rw-r--r--fs/ksmbd/mgmt/user_session.h6
-rw-r--r--fs/ksmbd/smb2misc.c31
-rw-r--r--fs/ksmbd/smb2ops.c8
-rw-r--r--fs/ksmbd/smb2pdu.c72
-rw-r--r--fs/ksmbd/vfs.c6
-rw-r--r--fs/ksmbd/vfs_cache.c5
-rw-r--r--fs/lockd/svc.c21
-rw-r--r--fs/namei.c6
-rw-r--r--fs/netfs/Makefile1
-rw-r--r--fs/netfs/iterator.c369
-rw-r--r--fs/nfs/callback_xdr.c13
-rw-r--r--fs/nfs/dir.c28
-rw-r--r--fs/nfs/direct.c12
-rw-r--r--fs/nfs/export.c2
-rw-r--r--fs/nfs/file.c124
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/internal.h38
-rw-r--r--fs/nfs/nfs42proc.c3
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/nfs4trace.h42
-rw-r--r--fs/nfs/nfstrace.h58
-rw-r--r--fs/nfs/pagelist.c217
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/pnfs.h10
-rw-r--r--fs/nfs/pnfs_nfs.c18
-rw-r--r--fs/nfs/read.c94
-rw-r--r--fs/nfs/write.c380
-rw-r--r--fs/nfsd/fault_inject.c142
-rw-r--r--fs/nfsd/filecache.c49
-rw-r--r--fs/nfsd/nfs2acl.c5
-rw-r--r--fs/nfsd/nfs3acl.c5
-rw-r--r--fs/nfsd/nfs3proc.c5
-rw-r--r--fs/nfsd/nfs4layouts.c4
-rw-r--r--fs/nfsd/nfs4proc.c202
-rw-r--r--fs/nfsd/nfs4state.c130
-rw-r--r--fs/nfsd/nfscache.c4
-rw-r--r--fs/nfsd/nfsctl.c77
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsproc.c6
-rw-r--r--fs/nfsd/nfssvc.c23
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/trace.h31
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nfsd/xdr4.h2
-rw-r--r--fs/ocfs2/cluster/tcp.c5
-rw-r--r--fs/smbfs_common/smb2pdu.h42
-rw-r--r--fs/splice.c93
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c32
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c32
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c96
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h4
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c50
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h6
-rw-r--r--fs/xfs/xfs_bmap_item.c137
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_extfree_item.c99
-rw-r--r--fs/xfs/xfs_globals.c3
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/xfs/xfs_refcount_item.c110
-rw-r--r--fs/xfs/xfs_rmap_item.c142
-rw-r--r--fs/xfs/xfs_sysfs.c12
-rw-r--r--fs/xfs/xfs_sysfs.h10
-rw-r--r--fs/xfs/xfs_trace.h15
-rw-r--r--fs/zonefs/Makefile2
-rw-r--r--fs/zonefs/file.c878
-rw-r--r--fs/zonefs/super.c1887
-rw-r--r--fs/zonefs/sysfs.c2
-rw-r--r--fs/zonefs/trace.h20
-rw-r--r--fs/zonefs/zonefs.h110
126 files changed, 6074 insertions, 5159 deletions
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 3b7e3b9e4fd2..4c0d53bf931a 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -18,40 +18,38 @@ config CIFS
select DNS_RESOLVER
select ASN1
select OID_REGISTRY
+ select NETFS_SUPPORT
help
- This is the client VFS module for the SMB3 family of NAS protocols,
- (including support for the most recent, most secure dialect SMB3.1.1)
- as well as for earlier dialects such as SMB2.1, SMB2 and the older
- Common Internet File System (CIFS) protocol. CIFS was the successor
- to the original dialect, the Server Message Block (SMB) protocol, the
- native file sharing mechanism for most early PC operating systems.
-
- The SMB3 protocol is supported by most modern operating systems
- and NAS appliances (e.g. Samba, Windows 10, Windows Server 2016,
- MacOS) and even in the cloud (e.g. Microsoft Azure).
- The older CIFS protocol was included in Windows NT4, 2000 and XP (and
- later) as well by Samba (which provides excellent CIFS and SMB3
- server support for Linux and many other operating systems). Use of
- dialects older than SMB2.1 is often discouraged on public networks.
+ This is the client VFS module for the SMB3 family of network file
+ protocols (including the most recent, most secure dialect SMB3.1.1).
+ This module also includes support for earlier dialects such as
+ SMB2.1, SMB2 and even the old Common Internet File System (CIFS)
+ protocol. CIFS was the successor to the original network filesystem
+ protocol, Server Message Block (SMB ie SMB1), the native file sharing
+ mechanism for most early PC operating systems.
+
+ The SMB3.1.1 protocol is supported by most modern operating systems
+ and NAS appliances (e.g. Samba, Windows 11, Windows Server 2022,
+ MacOS) and even in the cloud (e.g. Microsoft Azure) and also by the
+ Linux kernel server, ksmbd. Support for the older CIFS protocol was
+ included in Windows NT4, 2000 and XP (and later). Use of dialects
+ older than SMB2.1 is often discouraged on public networks.
This module also provides limited support for OS/2 and Windows ME
and similar very old servers.
- This module provides an advanced network file system client
- for mounting to SMB3 (and CIFS) compliant servers. It includes
- support for DFS (hierarchical name space), secure per-user
- session establishment via Kerberos or NTLM or NTLMv2, RDMA
- (smbdirect), advanced security features, per-share encryption,
- directory leases, safe distributed caching (oplock), optional packet
- signing, Unicode and other internationalization improvements.
+ This module provides an advanced network file system client for
+ mounting to SMB3 (and CIFS) compliant servers. It includes support
+ for DFS (hierarchical name space), secure per-user session
+ establishment via Kerberos or NTLMv2, RDMA (smbdirect), advanced
+ security features, per-share encryption, packet-signing, snapshots,
+ directory leases, safe distributed caching (leases), multichannel,
+ Unicode and other internationalization improvements.
In general, the default dialects, SMB3 and later, enable better
performance, security and features, than would be possible with CIFS.
- Note that when mounting to Samba, due to the CIFS POSIX extensions,
- CIFS mounts can provide slightly better POSIX compatibility
- than SMB3 mounts. SMB2/SMB3 mount options are also
- slightly simpler (compared to CIFS) due to protocol improvements.
- If you need to mount to Samba, Azure, Macs or Windows from this machine, say Y.
+ If you need to mount to Samba, Azure, ksmbd, Macs or Windows from this
+ machine, say Y.
config CIFS_STATS2
bool "Extended statistics"
@@ -111,12 +109,12 @@ config CIFS_POSIX
depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
help
Enabling this option will cause the cifs client to attempt to
- negotiate a newer dialect with servers, such as Samba 3.0.5
- or later, that optionally can handle more POSIX like (rather
- than Windows like) file behavior. It also enables
- support for POSIX ACLs (getfacl and setfacl) to servers
- (such as Samba 3.10 and later) which can negotiate
- CIFS POSIX ACL support. If unsure, say N.
+ negotiate a feature of the older cifs dialect with servers, such as
+ Samba 3.0.5 or later, that optionally can handle more POSIX like
+ (rather than Windows like) file behavior. It also enables support
+ for POSIX ACLs (getfacl and setfacl) to servers (such as Samba 3.10
+ and later) which can negotiate CIFS POSIX ACL support. This config
+ option is not needed when mounting with SMB3.1.1. If unsure, say N.
config CIFS_DEBUG
bool "Enable CIFS debugging routines"
@@ -178,6 +176,8 @@ config CIFS_NFSD_EXPORT
help
Allows NFS server to export a CIFS mounted share (nfsd over cifs)
+if CIFS
+
config CIFS_SMB_DIRECT
bool "SMB Direct support"
depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
@@ -201,3 +201,5 @@ config CIFS_ROOT
Enables root file system support over SMB protocol.
Most people say N here.
+
+endif
diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c
index 60399081046a..75d5e06306ea 100644
--- a/fs/cifs/cached_dir.c
+++ b/fs/cifs/cached_dir.c
@@ -14,6 +14,7 @@
static struct cached_fid *init_cached_dir(const char *path);
static void free_cached_dir(struct cached_fid *cfid);
+static void smb2_close_cached_fid(struct kref *ref);
static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
const char *path,
@@ -181,12 +182,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- oparms.tcon = tcon;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE);
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.fid = pfid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE),
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .fid = pfid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -220,8 +222,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
}
goto oshr_free;
}
-
- atomic_inc(&tcon->num_remote_opens);
+ cfid->tcon = tcon;
+ cfid->is_open = true;
o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
oparms.fid->persistent_fid = o_rsp->PersistentFileId;
@@ -233,12 +235,12 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE)
goto oshr_free;
-
smb2_parse_contexts(server, o_rsp,
&oparms.fid->epoch,
oparms.fid->lease_key, &oplock,
NULL, NULL);
-
+ if (!(oplock & SMB2_LEASE_READ_CACHING_HE))
+ goto oshr_free;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info))
goto oshr_free;
@@ -259,9 +261,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
}
}
cfid->dentry = dentry;
- cfid->tcon = tcon;
cfid->time = jiffies;
- cfid->is_open = true;
cfid->has_lease = true;
oshr_free:
@@ -271,7 +271,7 @@ oshr_free:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
spin_lock(&cfids->cfid_list_lock);
- if (!cfid->has_lease) {
+ if (rc && !cfid->has_lease) {
if (cfid->on_list) {
list_del(&cfid->entry);
cfid->on_list = false;
@@ -280,13 +280,27 @@ oshr_free:
rc = -ENOENT;
}
spin_unlock(&cfids->cfid_list_lock);
+ if (!rc && !cfid->has_lease) {
+ /*
+ * We are guaranteed to have two references at this point.
+ * One for the caller and one for a potential lease.
+ * Release the Lease-ref so that the directory will be closed
+ * when the caller closes the cached handle.
+ */
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
if (rc) {
+ if (cfid->is_open)
+ SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid);
free_cached_dir(cfid);
cfid = NULL;
}
- if (rc == 0)
+ if (rc == 0) {
*ret_cfid = cfid;
+ atomic_inc(&tcon->num_remote_opens);
+ }
return rc;
}
@@ -335,6 +349,7 @@ smb2_close_cached_fid(struct kref *ref)
if (cfid->is_open) {
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
cfid->fid.volatile_fid);
+ atomic_dec(&cfid->tcon->num_remote_opens);
}
free_cached_dir(cfid);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 56b23def4c95..1911f7016fa1 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/ctype.h>
+#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h>
@@ -455,8 +456,10 @@ skip_rdma:
spin_lock(&ses->iface_lock);
if (ses->iface_count)
- seq_printf(m, "\n\n\tServer interfaces: %zu",
- ses->iface_count);
+ seq_printf(m, "\n\n\tServer interfaces: %zu"
+ "\tLast updated: %lu seconds ago",
+ ses->iface_count,
+ (jiffies - ses->iface_last_update) / HZ);
j = 0;
list_for_each_entry(iface, &ses->iface_list,
iface_head) {
@@ -787,7 +790,7 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
rc = get_user(c[0], buffer);
if (rc)
return rc;
- if (strtobool(c, &bv) == 0)
+ if (kstrtobool(c, &bv) == 0)
cifsFYI = bv;
else if ((c[0] > '1') && (c[0] <= '9'))
cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
@@ -947,7 +950,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
- if (strtobool(flags_string, &bv) == 0) {
+ if (kstrtobool(flags_string, &bv) == 0) {
global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
} else if (!isdigit(flags_string[0])) {
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 7f102ffeb675..e4d751b0c812 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -24,7 +24,7 @@ struct cifs_spnego_msg {
uint32_t flags;
uint32_t sesskey_len;
uint32_t secblob_len;
- uint8_t data[1];
+ uint8_t data[];
};
#ifdef __KERNEL__
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 9a2d390bd06f..f5b6df82e857 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1428,14 +1428,15 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
tcon = tlink_tcon(tlink);
xid = get_xid();
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = READ_CONTROL;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.disposition = FILE_OPEN;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = READ_CONTROL,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .disposition = FILE_OPEN,
+ .path = path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (!rc) {
@@ -1494,14 +1495,15 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
else
access_flags = WRITE_DAC;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = access_flags;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.disposition = FILE_OPEN;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = access_flags,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .disposition = FILE_OPEN,
+ .path = path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index cbc18b4a9cb2..357bd27a7fd1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,12 +24,156 @@
#include "../smbfs_common/arc4.h"
#include <crypto/aead.h>
+/*
+ * Hash data from a BVEC-type iterator.
+ */
+static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize,
+ struct shash_desc *shash)
+{
+ const struct bio_vec *bv = iter->bvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ void *p;
+ int ret;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ size_t off, len;
+
+ len = bv[i].bv_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ len = min_t(size_t, maxsize, len - start);
+ off = bv[i].bv_offset + start;
+
+ p = kmap_local_page(bv[i].bv_page);
+ ret = crypto_shash_update(shash, p + off, len);
+ kunmap_local(p);
+ if (ret < 0)
+ return ret;
+
+ maxsize -= len;
+ if (maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Hash data from a KVEC-type iterator.
+ */
+static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize,
+ struct shash_desc *shash)
+{
+ const struct kvec *kv = iter->kvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ int ret;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ size_t len;
+
+ len = kv[i].iov_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ len = min_t(size_t, maxsize, len - start);
+ ret = crypto_shash_update(shash, kv[i].iov_base + start, len);
+ if (ret < 0)
+ return ret;
+ maxsize -= len;
+
+ if (maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Hash data from an XARRAY-type iterator.
+ */
+static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize,
+ struct shash_desc *shash)
+{
+ struct folio *folios[16], *folio;
+ unsigned int nr, i, j, npages;
+ loff_t start = iter->xarray_start + iter->iov_offset;
+ pgoff_t last, index = start / PAGE_SIZE;
+ ssize_t ret = 0;
+ size_t len, offset, foffset;
+ void *p;
+
+ if (maxsize == 0)
+ return 0;
+
+ last = (start + maxsize - 1) / PAGE_SIZE;
+ do {
+ nr = xa_extract(iter->xarray, (void **)folios, index, last,
+ ARRAY_SIZE(folios), XA_PRESENT);
+ if (nr == 0)
+ return -EIO;
+
+ for (i = 0; i < nr; i++) {
+ folio = folios[i];
+ npages = folio_nr_pages(folio);
+ foffset = start - folio_pos(folio);
+ offset = foffset % PAGE_SIZE;
+ for (j = foffset / PAGE_SIZE; j < npages; j++) {
+ len = min_t(size_t, maxsize, PAGE_SIZE - offset);
+ p = kmap_local_page(folio_page(folio, j));
+ ret = crypto_shash_update(shash, p, len);
+ kunmap_local(p);
+ if (ret < 0)
+ return ret;
+ maxsize -= len;
+ if (maxsize <= 0)
+ return 0;
+ start += len;
+ offset = 0;
+ index++;
+ }
+ }
+ } while (nr == ARRAY_SIZE(folios));
+ return 0;
+}
+
+/*
+ * Pass the data from an iterator into a hash.
+ */
+static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize,
+ struct shash_desc *shash)
+{
+ if (maxsize == 0)
+ return 0;
+
+ switch (iov_iter_type(iter)) {
+ case ITER_BVEC:
+ return cifs_shash_bvec(iter, maxsize, shash);
+ case ITER_KVEC:
+ return cifs_shash_kvec(iter, maxsize, shash);
+ case ITER_XARRAY:
+ return cifs_shash_xarray(iter, maxsize, shash);
+ default:
+ pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter));
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+}
+
int __cifs_calc_signature(struct smb_rqst *rqst,
- struct TCP_Server_Info *server, char *signature,
- struct shash_desc *shash)
+ struct TCP_Server_Info *server, char *signature,
+ struct shash_desc *shash)
{
int i;
- int rc;
+ ssize_t rc;
struct kvec *iov = rqst->rq_iov;
int n_vec = rqst->rq_nvec;
@@ -61,25 +205,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
}
}
- /* now hash over the rq_pages array */
- for (i = 0; i < rqst->rq_npages; i++) {
- void *kaddr;
- unsigned int len, offset;
-
- rqst_page_get_length(rqst, i, &len, &offset);
-
- kaddr = (char *) kmap(rqst->rq_pages[i]) + offset;
-
- rc = crypto_shash_update(shash, kaddr, len);
- if (rc) {
- cifs_dbg(VFS, "%s: Could not update with payload\n",
- __func__);
- kunmap(rqst->rq_pages[i]);
- return rc;
- }
-
- kunmap(rqst->rq_pages[i]);
- }
+ rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash);
+ if (rc < 0)
+ return rc;
rc = crypto_shash_final(shash, signature);
if (rc)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index cb7c5460a80b..cbcf210d56e4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1359,7 +1359,7 @@ const struct file_operations cifs_file_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = cifs_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
@@ -1379,7 +1379,7 @@ const struct file_operations cifs_file_strict_ops = {
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = cifs_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
@@ -1399,7 +1399,7 @@ const struct file_operations cifs_file_direct_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
@@ -1417,7 +1417,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = cifs_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
@@ -1435,7 +1435,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = cifs_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
@@ -1453,7 +1453,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
- .splice_read = generic_file_splice_read,
+ .splice_read = direct_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
.copy_file_range = cifs_copy_file_range,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b58cd737b21e..71fe0a0a7992 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,6 +100,9 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags);
extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
@@ -110,6 +113,9 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
+extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len);
+extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len);
+extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len);
/* Functions related to dir entries */
extern const struct dentry_operations cifs_dentry_ops;
@@ -154,5 +160,5 @@ extern const struct export_operations cifs_export_ops;
/* when changing internal version - update following two lines at same time */
#define SMB3_PRODUCT_BUILD 41
-#define CIFS_VERSION "2.41"
+#define CIFS_VERSION "2.42"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index cd8171a1c9a0..a99883f16d94 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -78,10 +78,6 @@
#define SMB_ECHO_INTERVAL_MAX 600
#define SMB_ECHO_INTERVAL_DEFAULT 60
-/* dns resolution intervals in seconds */
-#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
-#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
-
/* smb multichannel query server interfaces interval in seconds */
#define SMB_INTERFACE_POLL_INTERVAL 600
@@ -217,11 +213,9 @@ static inline void cifs_free_open_info(struct cifs_open_info_data *data)
struct smb_rqst {
struct kvec *rq_iov; /* array of kvecs */
unsigned int rq_nvec; /* number of kvecs in array */
- struct page **rq_pages; /* pointer to array of page ptrs */
- unsigned int rq_offset; /* the offset to the 1st page */
- unsigned int rq_npages; /* number pages in array */
- unsigned int rq_pagesz; /* page size to use */
- unsigned int rq_tailsz; /* length of last page */
+ size_t rq_iter_size; /* Amount of data in ->rq_iter */
+ struct iov_iter rq_iter; /* Data iterator */
+ struct xarray rq_buffer; /* Page buffer for encryption */
};
struct mid_q_entry;
@@ -692,7 +686,6 @@ struct TCP_Server_Info {
/* point to the SMBD connection if RDMA is used instead of socket */
struct smbd_connection *smbd_conn;
struct delayed_work echo; /* echo ping workqueue job */
- struct delayed_work resolve; /* dns resolution workqueue job */
char *smallbuf; /* pointer to current "small" buffer */
char *bigbuf; /* pointer to current "big" buffer */
/* Total size of this PDU. Only valid from cifs_demultiplex_thread */
@@ -1427,10 +1420,11 @@ struct cifs_aio_ctx {
struct cifsFileInfo *cfile;
struct bio_vec *bv;
loff_t pos;
- unsigned int npages;
+ unsigned int nr_pinned_pages;
ssize_t rc;
unsigned int len;
unsigned int total_len;
+ unsigned int bv_need_unpin; /* If ->bv[] needs unpinning */
bool should_dirty;
/*
* Indicates if this aio_ctx is for direct_io,
@@ -1448,28 +1442,18 @@ struct cifs_readdata {
struct address_space *mapping;
struct cifs_aio_ctx *ctx;
__u64 offset;
+ ssize_t got_bytes;
unsigned int bytes;
- unsigned int got_bytes;
pid_t pid;
int result;
struct work_struct work;
- int (*read_into_pages)(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata,
- unsigned int len);
- int (*copy_into_pages)(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata,
- struct iov_iter *iter);
+ struct iov_iter iter;
struct kvec iov[2];
struct TCP_Server_Info *server;
#ifdef CONFIG_CIFS_SMB_DIRECT
struct smbd_mr *mr;
#endif
- unsigned int pagesz;
- unsigned int page_offset;
- unsigned int tailsz;
struct cifs_credits credits;
- unsigned int nr_pages;
- struct page **pages;
};
/* asynchronous write support */
@@ -1481,6 +1465,8 @@ struct cifs_writedata {
struct work_struct work;
struct cifsFileInfo *cfile;
struct cifs_aio_ctx *ctx;
+ struct iov_iter iter;
+ struct bio_vec *bv;
__u64 offset;
pid_t pid;
unsigned int bytes;
@@ -1489,12 +1475,7 @@ struct cifs_writedata {
#ifdef CONFIG_CIFS_SMB_DIRECT
struct smbd_mr *mr;
#endif
- unsigned int pagesz;
- unsigned int page_offset;
- unsigned int tailsz;
struct cifs_credits credits;
- unsigned int nr_pages;
- struct page **pages;
};
/*
@@ -2154,15 +2135,21 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const
dst->FileNameLength = src->FileNameLength;
}
-static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
- int num_rqst,
- const u8 *sig)
+static inline int cifs_get_num_sgs(const struct smb_rqst *rqst,
+ int num_rqst,
+ const u8 *sig)
{
unsigned int len, skip;
unsigned int nents = 0;
unsigned long addr;
int i, j;
+ /*
+ * The first rqst has a transform header where the first 20 bytes are
+ * not part of the encrypted blob.
+ */
+ skip = 20;
+
/* Assumes the first rqst has a transform header as the first iov.
* I.e.
* rqst[0].rq_iov[0] is transform header
@@ -2170,14 +2157,22 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
* rqst[1+].rq_iov[0+] data to be encrypted/decrypted
*/
for (i = 0; i < num_rqst; i++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob.
+ /* We really don't want a mixture of pinned and unpinned pages
+ * in the sglist. It's hard to keep track of which is what.
+ * Instead, we convert to a BVEC-type iterator higher up.
*/
+ if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter)))
+ return -EIO;
+
+ /* We also don't want to have any extra refs or pins to clean
+ * up in the sglist.
+ */
+ if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter)))
+ return -EIO;
+
for (j = 0; j < rqst[i].rq_nvec; j++) {
struct kvec *iov = &rqst[i].rq_iov[j];
- skip = (i == 0) && (j == 0) ? 20 : 0;
addr = (unsigned long)iov->iov_base + skip;
if (unlikely(is_vmalloc_addr((void *)addr))) {
len = iov->iov_len - skip;
@@ -2186,8 +2181,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
} else {
nents++;
}
+ skip = 0;
}
- nents += rqst[i].rq_npages;
+ nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX);
}
nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
return nents;
@@ -2196,9 +2192,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
/* We can not use the normal sg_set_buf() as we will sometimes pass a
* stack object as buf.
*/
-static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
- const void *buf,
- unsigned int buflen)
+static inline void cifs_sg_set_buf(struct sg_table *sgtable,
+ const void *buf,
+ unsigned int buflen)
{
unsigned long addr = (unsigned long)buf;
unsigned int off = offset_in_page(addr);
@@ -2208,16 +2204,17 @@ static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
do {
unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);
- sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
+ sg_set_page(&sgtable->sgl[sgtable->nents++],
+ vmalloc_to_page((void *)addr), len, off);
off = 0;
addr += PAGE_SIZE;
buflen -= len;
} while (buflen);
} else {
- sg_set_page(sg++, virt_to_page(addr), buflen, off);
+ sg_set_page(&sgtable->sgl[sgtable->nents++],
+ virt_to_page(addr), buflen, off);
}
- return sg;
}
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 623caece2b10..445e3eaebcc1 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -562,7 +562,7 @@ typedef union smb_com_session_setup_andx {
__u32 Reserved;
__le32 Capabilities; /* see below */
__le16 ByteCount;
- unsigned char SecurityBlob[1]; /* followed by */
+ unsigned char SecurityBlob[]; /* followed by */
/* STRING NativeOS */
/* STRING NativeLanMan */
} __attribute__((packed)) req; /* NTLM request format (with
@@ -582,7 +582,7 @@ typedef union smb_com_session_setup_andx {
__u32 Reserved; /* see below */
__le32 Capabilities;
__le16 ByteCount;
- unsigned char CaseInsensitivePassword[1]; /* followed by: */
+ unsigned char CaseInsensitivePassword[]; /* followed by: */
/* unsigned char * CaseSensitivePassword; */
/* STRING AccountName */
/* STRING PrimaryDomain */
@@ -599,7 +599,7 @@ typedef union smb_com_session_setup_andx {
__le16 Action; /* see below */
__le16 SecurityBlobLength;
__u16 ByteCount;
- unsigned char SecurityBlob[1]; /* followed by */
+ unsigned char SecurityBlob[]; /* followed by */
/* unsigned char * NativeOS; */
/* unsigned char * NativeLanMan; */
/* unsigned char * PrimaryDomain; */
@@ -618,7 +618,7 @@ typedef union smb_com_session_setup_andx {
__le16 PasswordLength;
__u32 Reserved; /* encrypt key len and offset */
__le16 ByteCount;
- unsigned char AccountPassword[1]; /* followed by */
+ unsigned char AccountPassword[]; /* followed by */
/* STRING AccountName */
/* STRING PrimaryDomain */
/* STRING NativeOS */
@@ -632,7 +632,7 @@ typedef union smb_com_session_setup_andx {
__le16 AndXOffset;
__le16 Action; /* see below */
__u16 ByteCount;
- unsigned char NativeOS[1]; /* followed by */
+ unsigned char NativeOS[]; /* followed by */
/* unsigned char * NativeLanMan; */
/* unsigned char * PrimaryDomain; */
} __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */
@@ -693,7 +693,7 @@ typedef struct smb_com_tconx_req {
__le16 Flags; /* see below */
__le16 PasswordLength;
__le16 ByteCount;
- unsigned char Password[1]; /* followed by */
+ unsigned char Password[]; /* followed by */
/* STRING Path *//* \\server\share name */
/* STRING Service */
} __attribute__((packed)) TCONX_REQ;
@@ -705,7 +705,7 @@ typedef struct smb_com_tconx_rsp {
__le16 AndXOffset;
__le16 OptionalSupport; /* see below */
__u16 ByteCount;
- unsigned char Service[1]; /* always ASCII, not Unicode */
+ unsigned char Service[]; /* always ASCII, not Unicode */
/* STRING NativeFileSystem */
} __attribute__((packed)) TCONX_RSP;
@@ -718,7 +718,7 @@ typedef struct smb_com_tconx_rsp_ext {
__le32 MaximalShareAccessRights;
__le32 GuestMaximalShareAccessRights;
__u16 ByteCount;
- unsigned char Service[1]; /* always ASCII, not Unicode */
+ unsigned char Service[]; /* always ASCII, not Unicode */
/* STRING NativeFileSystem */
} __attribute__((packed)) TCONX_RSP_EXT;
@@ -755,14 +755,14 @@ typedef struct smb_com_echo_req {
struct smb_hdr hdr;
__le16 EchoCount;
__le16 ByteCount;
- char Data[1];
+ char Data[];
} __attribute__((packed)) ECHO_REQ;
typedef struct smb_com_echo_rsp {
struct smb_hdr hdr;
__le16 SequenceNumber;
__le16 ByteCount;
- char Data[1];
+ char Data[];
} __attribute__((packed)) ECHO_RSP;
typedef struct smb_com_logoff_andx_req {
@@ -862,7 +862,7 @@ typedef struct smb_com_open_req { /* also handles create */
__le32 ImpersonationLevel;
__u8 SecurityFlags;
__le16 ByteCount;
- char fileName[1];
+ char fileName[];
} __attribute__((packed)) OPEN_REQ;
/* open response: oplock levels */
@@ -937,7 +937,7 @@ typedef struct smb_com_openx_req {
__le32 Timeout;
__le32 Reserved;
__le16 ByteCount; /* file name follows */
- char fileName[1];
+ char fileName[];
} __attribute__((packed)) OPENX_REQ;
typedef struct smb_com_openx_rsp {
@@ -1085,7 +1085,7 @@ typedef struct smb_com_lock_req {
__le16 NumberOfUnlocks;
__le16 NumberOfLocks;
__le16 ByteCount;
- LOCKING_ANDX_RANGE Locks[1];
+ LOCKING_ANDX_RANGE Locks[];
} __attribute__((packed)) LOCK_REQ;
/* lock type */
@@ -1114,7 +1114,7 @@ typedef struct smb_com_rename_req {
__le16 SearchAttributes; /* target file attributes */
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII or Unicode */
- unsigned char OldFileName[1];
+ unsigned char OldFileName[];
/* followed by __u8 BufferFormat2 */
/* followed by NewFileName */
} __attribute__((packed)) RENAME_REQ;
@@ -1134,7 +1134,7 @@ typedef struct smb_com_copy_req {
__le16 Flags;
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII or Unicode */
- unsigned char OldFileName[1];
+ unsigned char OldFileName[];
/* followed by __u8 BufferFormat2 */
/* followed by NewFileName string */
} __attribute__((packed)) COPY_REQ;
@@ -1144,7 +1144,7 @@ typedef struct smb_com_copy_rsp {
__le16 CopyCount; /* number of files copied */
__u16 ByteCount; /* may be zero */
__u8 BufferFormat; /* 0x04 - only present if errored file follows */
- unsigned char ErrorFileName[1]; /* only present if error in copy */
+ unsigned char ErrorFileName[]; /* only present if error in copy */
} __attribute__((packed)) COPY_RSP;
#define CREATE_HARD_LINK 0x103
@@ -1158,7 +1158,7 @@ typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */
__le32 ClusterCount;
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII or Unicode */
- unsigned char OldFileName[1];
+ unsigned char OldFileName[];
/* followed by __u8 BufferFormat2 */
/* followed by NewFileName */
} __attribute__((packed)) NT_RENAME_REQ;
@@ -1173,7 +1173,7 @@ typedef struct smb_com_delete_file_req {
__le16 SearchAttributes;
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII */
- unsigned char fileName[1];
+ unsigned char fileName[];
} __attribute__((packed)) DELETE_FILE_REQ;
typedef struct smb_com_delete_file_rsp {
@@ -1185,7 +1185,7 @@ typedef struct smb_com_delete_directory_req {
struct smb_hdr hdr; /* wct = 0 */
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII */
- unsigned char DirName[1];
+ unsigned char DirName[];
} __attribute__((packed)) DELETE_DIRECTORY_REQ;
typedef struct smb_com_delete_directory_rsp {
@@ -1197,7 +1197,7 @@ typedef struct smb_com_create_directory_req {
struct smb_hdr hdr; /* wct = 0 */
__le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII */
- unsigned char DirName[1];
+ unsigned char DirName[];
} __attribute__((packed)) CREATE_DIRECTORY_REQ;
typedef struct smb_com_create_directory_rsp {
@@ -1209,7 +1209,7 @@ typedef struct smb_com_query_information_req {
struct smb_hdr hdr; /* wct = 0 */
__le16 ByteCount; /* 1 + namelen + 1 */
__u8 BufferFormat; /* 4 = ASCII */
- unsigned char FileName[1];
+ unsigned char FileName[];
} __attribute__((packed)) QUERY_INFORMATION_REQ;
typedef struct smb_com_query_information_rsp {
@@ -1229,7 +1229,7 @@ typedef struct smb_com_setattr_req {
__le16 reserved[5]; /* must be zero */
__u16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII */
- unsigned char fileName[1];
+ unsigned char fileName[];
} __attribute__((packed)) SETATTR_REQ;
typedef struct smb_com_setattr_rsp {
@@ -1311,7 +1311,7 @@ typedef struct smb_com_transaction_ioctl_req {
__u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
__le16 ByteCount;
__u8 Pad[3];
- __u8 Data[1];
+ __u8 Data[];
} __attribute__((packed)) TRANSACT_IOCTL_REQ;
typedef struct smb_com_transaction_compr_ioctl_req {
@@ -1430,7 +1430,7 @@ typedef struct smb_com_transaction_change_notify_req {
__u8 Reserved2;
__le16 ByteCount;
/* __u8 Pad[3];*/
-/* __u8 Data[1];*/
+/* __u8 Data[];*/
} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
/* BB eventually change to use generic ntransact rsp struct
@@ -1519,7 +1519,7 @@ struct cifs_quota_data {
__u64 space_used;
__u64 soft_limit;
__u64 hard_limit;
- char sid[1]; /* variable size? */
+ char sid[]; /* variable size? */
} __attribute__((packed));
/* quota sub commands */
@@ -1671,7 +1671,7 @@ typedef struct smb_com_transaction2_qpi_req {
__u8 Pad;
__le16 InformationLevel;
__u32 Reserved4;
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) TRANSACTION2_QPI_REQ;
typedef struct smb_com_transaction2_qpi_rsp {
@@ -1704,7 +1704,7 @@ typedef struct smb_com_transaction2_spi_req {
__u16 Pad1;
__le16 InformationLevel;
__u32 Reserved4;
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) TRANSACTION2_SPI_REQ;
typedef struct smb_com_transaction2_spi_rsp {
@@ -1809,7 +1809,7 @@ typedef struct smb_com_transaction2_ffirst_req {
__le16 SearchFlags;
__le16 InformationLevel;
__le32 SearchStorageType;
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) TRANSACTION2_FFIRST_REQ;
typedef struct smb_com_transaction2_ffirst_rsp {
@@ -2020,7 +2020,7 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
perhaps?) followed by one byte pad - doesn't
seem to matter though */
__le16 MaxReferralLevel;
- char RequestFileName[1];
+ char RequestFileName[];
} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ;
#define DFS_VERSION cpu_to_le16(0x0003)
@@ -2049,7 +2049,7 @@ struct get_dfs_referral_rsp {
__le16 PathConsumed;
__le16 NumberOfReferrals;
__le32 DFSFlags;
- REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */
+ REFERRAL3 referrals[]; /* array of level 3 dfs_referral structures */
/* followed by the strings pointed to by the referral structures */
} __packed;
@@ -2284,7 +2284,10 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
__le32 Mode;
__le32 AlignmentRequirement;
__le32 FileNameLength;
- char FileName[1];
+ union {
+ char __pad;
+ DECLARE_FLEX_ARRAY(char, FileName);
+ };
} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */
typedef struct {
@@ -2322,7 +2325,7 @@ typedef struct {
} __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */
typedef struct {
- char LinkDest[1];
+ DECLARE_FLEX_ARRAY(char, LinkDest);
} __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */
/* The following three structures are needed only for
@@ -2371,7 +2374,7 @@ struct file_end_of_file_info {
} __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */
struct file_alt_name_info {
- __u8 alt_name[1];
+ DECLARE_FLEX_ARRAY(__u8, alt_name);
} __attribute__((packed)); /* level 0x0108 */
struct file_stream_info {
@@ -2480,7 +2483,10 @@ typedef struct {
__le32 NextEntryOffset;
__u32 ResumeKey; /* as with FileIndex - no need to convert */
FILE_UNIX_BASIC_INFO basic;
- char FileName[1];
+ union {
+ char __pad;
+ DECLARE_FLEX_ARRAY(char, FileName);
+ };
} __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */
typedef struct {
@@ -2494,7 +2500,7 @@ typedef struct {
__le64 AllocationSize;
__le32 ExtFileAttributes;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */
typedef struct {
@@ -2509,7 +2515,7 @@ typedef struct {
__le32 ExtFileAttributes;
__le32 FileNameLength;
__le32 EaSize; /* length of the xattrs */
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */
typedef struct {
@@ -2526,7 +2532,7 @@ typedef struct {
__le32 EaSize; /* EA size */
__le32 Reserved;
__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
typedef struct {
@@ -2544,7 +2550,7 @@ typedef struct {
__u8 ShortNameLength;
__u8 Reserved;
__u8 ShortName[24];
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */
typedef struct {
@@ -2559,7 +2565,7 @@ typedef struct {
__le32 AllocationSize;
__le16 Attributes; /* verify not u32 */
__u8 FileNameLength;
- char FileName[1];
+ char FileName[];
} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */
@@ -2569,21 +2575,11 @@ struct win_dev {
__le64 minor;
} __attribute__((packed));
-struct gea {
- unsigned char name_len;
- char name[1];
-} __attribute__((packed));
-
-struct gealist {
- unsigned long list_len;
- struct gea list[1];
-} __attribute__((packed));
-
struct fea {
unsigned char EA_flags;
__u8 name_len;
__le16 value_len;
- char name[1];
+ char name[];
/* optionally followed by value */
} __attribute__((packed));
/* flags for _FEA.fEA */
@@ -2591,7 +2587,7 @@ struct fea {
struct fealist {
__le32 list_len;
- struct fea list[1];
+ struct fea list;
} __attribute__((packed));
/* used to hold an arbitrary blob of data */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b8a47704a6ef..b7a36ebd0f2f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -244,6 +244,9 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
struct page *page,
unsigned int page_offset,
unsigned int to_read);
+int cifs_read_iter_from_socket(struct TCP_Server_Info *server,
+ struct iov_iter *iter,
+ unsigned int to_read);
extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb);
void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
@@ -581,10 +584,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
int cifs_async_writev(struct cifs_writedata *wdata,
void (*release)(struct kref *kref));
void cifs_writev_complete(struct work_struct *work);
-struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
- work_func_t complete);
-struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages,
- work_func_t complete);
+struct cifs_writedata *cifs_writedata_alloc(work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
@@ -601,13 +601,10 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
void cifs_aio_ctx_release(struct kref *refcount);
-int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
void cifs_free_hash(struct shash_desc **sdesc);
-void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 60dd4e37030a..a24e4ddf8043 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -25,6 +25,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/uaccess.h>
#include "cifspdu.h"
+#include "cifsfs.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
@@ -1295,11 +1296,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *server = tcon->ses->server;
struct smb_rqst rqst = { .rq_iov = rdata->iov,
.rq_nvec = 2,
- .rq_pages = rdata->pages,
- .rq_offset = rdata->page_offset,
- .rq_npages = rdata->nr_pages,
- .rq_pagesz = rdata->pagesz,
- .rq_tailsz = rdata->tailsz };
+ .rq_iter_size = iov_iter_count(&rdata->iter),
+ .rq_iter = rdata->iter };
struct cifs_credits credits = { .value = 1, .instance = 0 };
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
@@ -1738,11 +1736,8 @@ cifs_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
- rqst.rq_pages = wdata->pages;
- rqst.rq_offset = wdata->page_offset;
- rqst.rq_npages = wdata->nr_pages;
- rqst.rq_pagesz = wdata->pagesz;
- rqst.rq_tailsz = wdata->tailsz;
+ rqst.rq_iter = wdata->iter;
+ rqst.rq_iter_size = iov_iter_count(&wdata->iter);
cifs_dbg(FYI, "async write at %llu %u bytes\n",
wdata->offset, wdata->bytes);
@@ -5373,14 +5368,15 @@ CIFSSMBSetPathInfoFB(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
int rc;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.disposition = FILE_OPEN;
- oparms.path = fileName;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_WRITE,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .disposition = FILE_OPEN,
+ .path = fileName,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
@@ -5787,7 +5783,7 @@ QAllEAsRetry:
/* account for ea list len */
list_len -= 4;
- temp_fea = ea_response_data->list;
+ temp_fea = &ea_response_data->list;
temp_ptr = (char *)temp_fea;
while (list_len > 0) {
unsigned int name_len;
@@ -5902,7 +5898,7 @@ SetEARetry:
else
name_len = strnlen(ea_name, 255);
- count = sizeof(*parm_data) + ea_value_len + name_len;
+ count = sizeof(*parm_data) + 1 + ea_value_len + name_len;
pSMB->MaxParameterCount = cpu_to_le16(2);
/* BB find max SMB PDU from sess */
pSMB->MaxDataCount = cpu_to_le16(1000);
@@ -5926,14 +5922,14 @@ SetEARetry:
byte_count = 3 /* pad */ + params + count;
pSMB->DataCount = cpu_to_le16(count);
parm_data->list_len = cpu_to_le32(count);
- parm_data->list[0].EA_flags = 0;
+ parm_data->list.EA_flags = 0;
/* we checked above that name len is less than 255 */
- parm_data->list[0].name_len = (__u8)name_len;
+ parm_data->list.name_len = (__u8)name_len;
/* EA names are always ASCII */
if (ea_name)
- strncpy(parm_data->list[0].name, ea_name, name_len);
- parm_data->list[0].name[name_len] = 0;
- parm_data->list[0].value_len = cpu_to_le16(ea_value_len);
+ strncpy(parm_data->list.name, ea_name, name_len);
+ parm_data->list.name[name_len] = '\0';
+ parm_data->list.value_len = cpu_to_le16(ea_value_len);
/* caller ensures that ea_value_len is less than 64K but
we need to ensure that it fits within the smb */
@@ -5941,7 +5937,7 @@ SetEARetry:
negotiated SMB buffer size BB */
/* if (ea_value_len > buffer_size - 512 (enough for header)) */
if (ea_value_len)
- memcpy(parm_data->list[0].name+name_len+1,
+ memcpy(parm_data->list.name + name_len + 1,
ea_value, ea_value_len);
pSMB->TotalDataCount = pSMB->DataCount;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e6088d96eb04..ec020d860be3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -79,8 +79,6 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
int len;
char *unc;
struct sockaddr_storage ss;
- time64_t expiry, now;
- unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT;
if (!server->hostname)
return -EINVAL;
@@ -102,29 +100,19 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
ss = server->dstaddr;
spin_unlock(&server->srv_lock);
- rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry);
+ rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL);
kfree(unc);
if (rc < 0) {
cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
__func__, server->hostname, rc);
- goto requeue_resolve;
+ } else {
+ spin_lock(&server->srv_lock);
+ memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr));
+ spin_unlock(&server->srv_lock);
+ rc = 0;
}
- spin_lock(&server->srv_lock);
- memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr));
- spin_unlock(&server->srv_lock);
-
- now = ktime_get_real_seconds();
- if (expiry && expiry > now)
- /* To make sure we don't use the cached entry, retry 1s */
- ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
-
-requeue_resolve:
- cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n",
- __func__, ttl);
- mod_delayed_work(cifsiod_wq, &server->resolve, (ttl * HZ));
-
return rc;
}
@@ -148,26 +136,6 @@ static void smb2_query_server_interfaces(struct work_struct *work)
(SMB_INTERFACE_POLL_INTERVAL * HZ));
}
-static void cifs_resolve_server(struct work_struct *work)
-{
- int rc;
- struct TCP_Server_Info *server = container_of(work,
- struct TCP_Server_Info, resolve.work);
-
- cifs_server_lock(server);
-
- /*
- * Resolve the hostname again to make sure that IP address is up-to-date.
- */
- rc = reconn_set_ipaddr_from_hostname(server);
- if (rc) {
- cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
- __func__, rc);
- }
-
- cifs_server_unlock(server);
-}
-
/*
* Update the tcpStatus for the server.
* This is used to signal the cifsd thread to call cifs_reconnect
@@ -766,6 +734,20 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
return cifs_readv_from_socket(server, &smb_msg);
}
+int
+cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter,
+ unsigned int to_read)
+{
+ struct msghdr smb_msg = { .msg_iter = *iter };
+ int ret;
+
+ iov_iter_truncate(&smb_msg.msg_iter, to_read);
+ ret = cifs_readv_from_socket(server, &smb_msg);
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
static bool
is_smb_response(struct TCP_Server_Info *server, unsigned char type)
{
@@ -926,7 +908,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
spin_unlock(&server->srv_lock);
cancel_delayed_work_sync(&server->echo);
- cancel_delayed_work_sync(&server->resolve);
spin_lock(&server->srv_lock);
server->tcpStatus = CifsExiting;
@@ -1550,7 +1531,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
cifs_put_tcp_session(server->primary_server, from_reconnect);
cancel_delayed_work_sync(&server->echo);
- cancel_delayed_work_sync(&server->resolve);
if (from_reconnect)
/*
@@ -1656,7 +1636,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
- INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server);
INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server);
mutex_init(&tcp_ses->reconnect_mutex);
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -1745,12 +1724,6 @@ smbd_connected:
/* queue echo request delayed work */
queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
- /* queue dns resolution delayed work */
- cifs_dbg(FYI, "%s: next dns resolution scheduled for %d seconds in the future\n",
- __func__, SMB_DNS_RESOLVE_INTERVAL_DEFAULT);
-
- queue_delayed_work(cifsiod_wq, &tcp_ses->resolve, (SMB_DNS_RESOLVE_INTERVAL_DEFAULT * HZ));
-
return tcp_ses;
out_err_crypto_release:
@@ -2844,72 +2817,48 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* negprot - BB check reconnection in case where second
* sessinit is sent but no second negprot
*/
- struct rfc1002_session_packet *ses_init_buf;
- unsigned int req_noscope_len;
- struct smb_hdr *smb_buf;
+ struct rfc1002_session_packet req = {};
+ struct smb_hdr *smb_buf = (struct smb_hdr *)&req;
+ unsigned int len;
- ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
- GFP_KERNEL);
+ req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name);
- if (ses_init_buf) {
- ses_init_buf->trailer.session_req.called_len = 32;
+ if (server->server_RFC1001_name[0] != 0)
+ rfc1002mangle(req.trailer.session_req.called_name,
+ server->server_RFC1001_name,
+ RFC1001_NAME_LEN_WITH_NULL);
+ else
+ rfc1002mangle(req.trailer.session_req.called_name,
+ DEFAULT_CIFS_CALLED_NAME,
+ RFC1001_NAME_LEN_WITH_NULL);
- if (server->server_RFC1001_name[0] != 0)
- rfc1002mangle(ses_init_buf->trailer.
- session_req.called_name,
- server->server_RFC1001_name,
- RFC1001_NAME_LEN_WITH_NULL);
- else
- rfc1002mangle(ses_init_buf->trailer.
- session_req.called_name,
- DEFAULT_CIFS_CALLED_NAME,
- RFC1001_NAME_LEN_WITH_NULL);
+ req.trailer.session_req.calling_len = sizeof(req.trailer.session_req.calling_name);
- ses_init_buf->trailer.session_req.calling_len = 32;
+ /* calling name ends in null (byte 16) from old smb convention */
+ if (server->workstation_RFC1001_name[0] != 0)
+ rfc1002mangle(req.trailer.session_req.calling_name,
+ server->workstation_RFC1001_name,
+ RFC1001_NAME_LEN_WITH_NULL);
+ else
+ rfc1002mangle(req.trailer.session_req.calling_name,
+ "LINUX_CIFS_CLNT",
+ RFC1001_NAME_LEN_WITH_NULL);
- /*
- * calling name ends in null (byte 16) from old smb
- * convention.
- */
- if (server->workstation_RFC1001_name[0] != 0)
- rfc1002mangle(ses_init_buf->trailer.
- session_req.calling_name,
- server->workstation_RFC1001_name,
- RFC1001_NAME_LEN_WITH_NULL);
- else
- rfc1002mangle(ses_init_buf->trailer.
- session_req.calling_name,
- "LINUX_CIFS_CLNT",
- RFC1001_NAME_LEN_WITH_NULL);
-
- ses_init_buf->trailer.session_req.scope1 = 0;
- ses_init_buf->trailer.session_req.scope2 = 0;
- smb_buf = (struct smb_hdr *)ses_init_buf;
-
- /* sizeof RFC1002_SESSION_REQUEST with no scopes */
- req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
-
- /* == cpu_to_be32(0x81000044) */
- smb_buf->smb_buf_length =
- cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
- rc = smb_send(server, smb_buf, 0x44);
- kfree(ses_init_buf);
- /*
- * RFC1001 layer in at least one server
- * requires very short break before negprot
- * presumably because not expecting negprot
- * to follow so fast. This is a simple
- * solution that works without
- * complicating the code and causes no
- * significant slowing down on mount
- * for everyone else
- */
- usleep_range(1000, 2000);
- }
/*
- * else the negprot may still work without this
- * even though malloc failed
+ * As per rfc1002, @len must be the number of bytes that follows the
+ * length field of a rfc1002 session request payload.
*/
+ len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req);
+
+ smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len);
+ rc = smb_send(server, smb_buf, len);
+ /*
+ * RFC1001 layer in at least one server requires very short break before
+ * negprot presumably because not expecting negprot to follow so fast.
+ * This is a simple solution that works without complicating the code
+ * and causes no significant slowing down on mount for everyone else
+ */
+ usleep_range(1000, 2000);
return rc;
}
@@ -3760,16 +3709,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
- struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
+ struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr;
bool is_binding = false;
spin_lock(&ses->ses_lock);
- if (server->dstaddr.ss_family == AF_INET6)
- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
- else
- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
-
if (ses->ses_status != SES_GOOD &&
ses->ses_status != SES_NEW &&
ses->ses_status != SES_NEED_RECON) {
@@ -3793,6 +3738,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
ses->ses_status = SES_IN_SETUP;
spin_unlock(&ses->ses_lock);
+ /* update ses ip_addr only for primary chan */
+ if (server == pserver) {
+ if (server->dstaddr.ss_family == AF_INET6)
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+ else
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+ }
+
if (!is_binding) {
ses->capabilities = server->capabilities;
if (!linuxExtEnabled)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 2b6076324ffc..30b1e1bfd204 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -304,15 +304,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = desired_access;
- oparms.create_options = cifs_create_options(cifs_sb, create_options);
- oparms.disposition = disposition;
- oparms.path = full_path;
- oparms.fid = fid;
- oparms.reconnect = false;
- oparms.mode = mode;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = desired_access,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .disposition = disposition,
+ .path = full_path,
+ .fid = fid,
+ .mode = mode,
+ };
rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc) {
cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2870e3b6ffe8..0e602173ac76 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -38,6 +38,125 @@
#include "cached_dir.h"
/*
+ * Remove the dirty flags from a span of pages.
+ */
+static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ pgoff_t end;
+
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) {
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_unlock(folio);
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Completion of write to server.
+ */
+void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ pgoff_t end;
+
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+
+ if (!len)
+ return;
+
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, end) {
+ if (!folio_test_writeback(folio)) {
+ WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
+ len, start, folio_index(folio), end);
+ continue;
+ }
+
+ folio_detach_private(folio);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Failure of write to server.
+ */
+void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ pgoff_t end;
+
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+
+ if (!len)
+ return;
+
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, end) {
+ if (!folio_test_writeback(folio)) {
+ WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
+ len, start, folio_index(folio), end);
+ continue;
+ }
+
+ folio_set_error(folio);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Redirty pages after a temporary failure.
+ */
+void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct folio *folio;
+ pgoff_t end;
+
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
+
+ if (!len)
+ return;
+
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, end) {
+ if (!folio_test_writeback(folio)) {
+ WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
+ len, start, folio_index(folio), end);
+ continue;
+ }
+
+ filemap_dirty_folio(folio->mapping, folio);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
* Mark as invalid, all open files on tree connections since they
* were closed when session to server was lost.
*/
@@ -261,14 +380,15 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
if (f_flags & O_DIRECT)
create_options |= CREATE_NO_BUFFER;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = desired_access;
- oparms.create_options = cifs_create_options(cifs_sb, create_options);
- oparms.disposition = disposition;
- oparms.path = full_path;
- oparms.fid = fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = desired_access,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .disposition = disposition,
+ .path = full_path,
+ .fid = fid,
+ };
rc = server->ops->open(xid, &oparms, oplock, buf);
if (rc)
@@ -849,14 +969,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = desired_access;
- oparms.create_options = cifs_create_options(cifs_sb, create_options);
- oparms.disposition = disposition;
- oparms.path = full_path;
- oparms.fid = &cfile->fid;
- oparms.reconnect = true;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = desired_access,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .disposition = disposition,
+ .path = full_path,
+ .fid = &cfile->fid,
+ .reconnect = true,
+ };
/*
* Can not refresh inode by passing in file_info buf to be returned by
@@ -2296,7 +2418,6 @@ cifs_writedata_release(struct kref *refcount)
if (wdata->cfile)
cifsFileInfo_put(wdata->cfile);
- kvfree(wdata->pages);
kfree(wdata);
}
@@ -2307,51 +2428,49 @@ cifs_writedata_release(struct kref *refcount)
static void
cifs_writev_requeue(struct cifs_writedata *wdata)
{
- int i, rc = 0;
+ int rc = 0;
struct inode *inode = d_inode(wdata->cfile->dentry);
struct TCP_Server_Info *server;
- unsigned int rest_len;
+ unsigned int rest_len = wdata->bytes;
+ loff_t fpos = wdata->offset;
server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- i = 0;
- rest_len = wdata->bytes;
do {
struct cifs_writedata *wdata2;
- unsigned int j, nr_pages, wsize, tailsz, cur_len;
+ unsigned int wsize, cur_len;
wsize = server->ops->wp_retry_size(inode);
if (wsize < rest_len) {
- nr_pages = wsize / PAGE_SIZE;
- if (!nr_pages) {
+ if (wsize < PAGE_SIZE) {
rc = -EOPNOTSUPP;
break;
}
- cur_len = nr_pages * PAGE_SIZE;
- tailsz = PAGE_SIZE;
+ cur_len = min(round_down(wsize, PAGE_SIZE), rest_len);
} else {
- nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
cur_len = rest_len;
- tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
}
- wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
+ wdata2 = cifs_writedata_alloc(cifs_writev_complete);
if (!wdata2) {
rc = -ENOMEM;
break;
}
- for (j = 0; j < nr_pages; j++) {
- wdata2->pages[j] = wdata->pages[i + j];
- lock_page(wdata2->pages[j]);
- clear_page_dirty_for_io(wdata2->pages[j]);
- }
-
wdata2->sync_mode = wdata->sync_mode;
- wdata2->nr_pages = nr_pages;
- wdata2->offset = page_offset(wdata2->pages[0]);
- wdata2->pagesz = PAGE_SIZE;
- wdata2->tailsz = tailsz;
- wdata2->bytes = cur_len;
+ wdata2->offset = fpos;
+ wdata2->bytes = cur_len;
+ wdata2->iter = wdata->iter;
+
+ iov_iter_advance(&wdata2->iter, fpos - wdata->offset);
+ iov_iter_truncate(&wdata2->iter, wdata2->bytes);
+
+ if (iov_iter_is_xarray(&wdata2->iter))
+ /* Check for pages having been redirtied and clean
+ * them. We can do this by walking the xarray. If
+ * it's not an xarray, then it's a DIO and we shouldn't
+ * be mucking around with the page bits.
+ */
+ cifs_undirty_folios(inode, fpos, cur_len);
rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
&wdata2->cfile);
@@ -2366,33 +2485,22 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
cifs_writedata_release);
}
- for (j = 0; j < nr_pages; j++) {
- unlock_page(wdata2->pages[j]);
- if (rc != 0 && !is_retryable_error(rc)) {
- SetPageError(wdata2->pages[j]);
- end_page_writeback(wdata2->pages[j]);
- put_page(wdata2->pages[j]);
- }
- }
-
kref_put(&wdata2->refcount, cifs_writedata_release);
if (rc) {
if (is_retryable_error(rc))
continue;
- i += nr_pages;
+ fpos += cur_len;
+ rest_len -= cur_len;
break;
}
+ fpos += cur_len;
rest_len -= cur_len;
- i += nr_pages;
- } while (i < wdata->nr_pages);
+ } while (rest_len > 0);
- /* cleanup remaining pages from the original wdata */
- for (; i < wdata->nr_pages; i++) {
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- put_page(wdata->pages[i]);
- }
+ /* Clean up remaining pages from the original wdata */
+ if (iov_iter_is_xarray(&wdata->iter))
+ cifs_pages_write_failed(inode, fpos, rest_len);
if (rc != 0 && !is_retryable_error(rc))
mapping_set_error(inode->i_mapping, rc);
@@ -2405,7 +2513,6 @@ cifs_writev_complete(struct work_struct *work)
struct cifs_writedata *wdata = container_of(work,
struct cifs_writedata, work);
struct inode *inode = d_inode(wdata->cfile->dentry);
- int i = 0;
if (wdata->result == 0) {
spin_lock(&inode->i_lock);
@@ -2416,45 +2523,24 @@ cifs_writev_complete(struct work_struct *work)
} else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN)
return cifs_writev_requeue(wdata);
- for (i = 0; i < wdata->nr_pages; i++) {
- struct page *page = wdata->pages[i];
+ if (wdata->result == -EAGAIN)
+ cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes);
+ else if (wdata->result < 0)
+ cifs_pages_write_failed(inode, wdata->offset, wdata->bytes);
+ else
+ cifs_pages_written_back(inode, wdata->offset, wdata->bytes);
- if (wdata->result == -EAGAIN)
- __set_page_dirty_nobuffers(page);
- else if (wdata->result < 0)
- SetPageError(page);
- end_page_writeback(page);
- cifs_readpage_to_fscache(inode, page);
- put_page(page);
- }
if (wdata->result != -EAGAIN)
mapping_set_error(inode->i_mapping, wdata->result);
kref_put(&wdata->refcount, cifs_writedata_release);
}
-struct cifs_writedata *
-cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
-{
- struct cifs_writedata *writedata = NULL;
- struct page **pages =
- kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (pages) {
- writedata = cifs_writedata_direct_alloc(pages, complete);
- if (!writedata)
- kvfree(pages);
- }
-
- return writedata;
-}
-
-struct cifs_writedata *
-cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
+struct cifs_writedata *cifs_writedata_alloc(work_func_t complete)
{
struct cifs_writedata *wdata;
wdata = kzalloc(sizeof(*wdata), GFP_NOFS);
if (wdata != NULL) {
- wdata->pages = pages;
kref_init(&wdata->refcount);
INIT_LIST_HEAD(&wdata->list);
init_completion(&wdata->done);
@@ -2463,7 +2549,6 @@ cifs_writedata_direct_alloc(struct page **pages, work_func_t complete)
return wdata;
}
-
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
{
struct address_space *mapping = page->mapping;
@@ -2522,310 +2607,372 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
return rc;
}
-static struct cifs_writedata *
-wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
- pgoff_t end, pgoff_t *index,
- unsigned int *found_pages)
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ */
+static void cifs_extend_writeback(struct address_space *mapping,
+ long *_count,
+ loff_t start,
+ int max_pages,
+ size_t max_len,
+ unsigned int *_len)
{
- struct cifs_writedata *wdata;
-
- wdata = cifs_writedata_alloc((unsigned int)tofind,
- cifs_writev_complete);
- if (!wdata)
- return NULL;
-
- *found_pages = find_get_pages_range_tag(mapping, index, end,
- PAGECACHE_TAG_DIRTY, tofind, wdata->pages);
- return wdata;
-}
+ struct folio_batch batch;
+ struct folio *folio;
+ unsigned int psize, nr_pages;
+ size_t len = *_len;
+ pgoff_t index = (start + len) / PAGE_SIZE;
+ bool stop = true;
+ unsigned int i;
+ XA_STATE(xas, &mapping->i_pages, index);
-static unsigned int
-wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
- struct address_space *mapping,
- struct writeback_control *wbc,
- pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
-{
- unsigned int nr_pages = 0, i;
- struct page *page;
+ folio_batch_init(&batch);
- for (i = 0; i < found_pages; i++) {
- page = wdata->pages[i];
- /*
- * At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
*/
+ rcu_read_lock();
- if (nr_pages == 0)
- lock_page(page);
- else if (!trylock_page(page))
- break;
-
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- break;
- }
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio_index(folio) != index)
+ break;
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > max_pages)
+ break;
- if (!wbc->range_cyclic && page->index > end) {
- *done = true;
- unlock_page(page);
- break;
- }
+ /* Has the page moved or been split? */
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ break;
+ }
- if (*next && (page->index != *next)) {
- /* Not next consecutive page */
- unlock_page(page);
- break;
- }
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ break;
+ }
+ if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ break;
+ }
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
+ max_pages -= nr_pages;
+ psize = folio_size(folio);
+ len += psize;
+ stop = false;
+ if (max_pages <= 0 || len >= max_len || *_count <= 0)
+ stop = true;
- if (PageWriteback(page) ||
- !clear_page_dirty_for_io(page)) {
- unlock_page(page);
- break;
+ index += nr_pages;
+ if (!folio_batch_add(&batch, folio))
+ break;
+ if (stop)
+ break;
}
- /*
- * This actually clears the dirty bit in the radix tree.
- * See cifs_writepage() for more commentary.
+ if (!stop)
+ xas_pause(&xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any pages, we can shift them to being
+ * writable and mark them for caching.
*/
- set_page_writeback(page);
- if (page_offset(page) >= i_size_read(mapping->host)) {
- *done = true;
- unlock_page(page);
- end_page_writeback(page);
+ if (!folio_batch_count(&batch))
break;
- }
- wdata->pages[i] = page;
- *next = page->index + 1;
- ++nr_pages;
- }
-
- /* reset index to refind any pages skipped */
- if (nr_pages == 0)
- *index = wdata->pages[0]->index + 1;
-
- /* put any pages we aren't going to use */
- for (i = nr_pages; i < found_pages; i++) {
- put_page(wdata->pages[i]);
- wdata->pages[i] = NULL;
- }
-
- return nr_pages;
-}
-
-static int
-wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
- struct address_space *mapping, struct writeback_control *wbc)
-{
- int rc;
-
- wdata->sync_mode = wbc->sync_mode;
- wdata->nr_pages = nr_pages;
- wdata->offset = page_offset(wdata->pages[0]);
- wdata->pagesz = PAGE_SIZE;
- wdata->tailsz = min(i_size_read(mapping->host) -
- page_offset(wdata->pages[nr_pages - 1]),
- (loff_t)PAGE_SIZE);
- wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
- wdata->pid = wdata->cfile->pid;
-
- rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
- if (rc)
- return rc;
-
- if (wdata->cfile->invalidHandle)
- rc = -EAGAIN;
- else
- rc = wdata->server->ops->async_writev(wdata,
- cifs_writedata_release);
-
- return rc;
-}
+ for (i = 0; i < folio_batch_count(&batch); i++) {
+ folio = batch.folios[i];
+ /* The folio should be locked, dirty and not undergoing
+ * writeback from the loop above.
+ */
+ if (!folio_clear_dirty_for_io(folio))
+ WARN_ON(1);
+ if (folio_start_writeback(folio))
+ WARN_ON(1);
-static int
-cifs_writepage_locked(struct page *page, struct writeback_control *wbc);
+ *_count -= folio_nr_pages(folio);
+ folio_unlock(folio);
+ }
-static int cifs_write_one_page(struct page *page, struct writeback_control *wbc,
- void *data)
-{
- struct address_space *mapping = data;
- int ret;
+ folio_batch_release(&batch);
+ cond_resched();
+ } while (!stop);
- ret = cifs_writepage_locked(page, wbc);
- unlock_page(page);
- mapping_set_error(mapping, ret);
- return ret;
+ *_len = len;
}
-static int cifs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+/*
+ * Write back the locked page and any subsequent non-locked dirty pages.
+ */
+static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct folio *folio,
+ loff_t start, loff_t end)
{
struct inode *inode = mapping->host;
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct TCP_Server_Info *server;
- bool done = false, scanned = false, range_whole = false;
- pgoff_t end, index;
struct cifs_writedata *wdata;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifs_credits credits_on_stack;
+ struct cifs_credits *credits = &credits_on_stack;
struct cifsFileInfo *cfile = NULL;
- int rc = 0;
- int saved_rc = 0;
- unsigned int xid;
+ unsigned int xid, wsize, len;
+ loff_t i_size = i_size_read(inode);
+ size_t max_len;
+ long count = wbc->nr_to_write;
+ int rc;
- /*
- * If wsize is smaller than the page cache size, default to writing
- * one page at a time.
- */
- if (cifs_sb->ctx->wsize < PAGE_SIZE)
- return write_cache_pages(mapping, wbc, cifs_write_one_page,
- mapping);
+ /* The folio should be locked, dirty and not undergoing writeback. */
+ if (folio_start_writeback(folio))
+ WARN_ON(1);
+
+ count -= folio_nr_pages(folio);
+ len = folio_size(folio);
xid = get_xid();
- if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
- end = -1;
- } else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = true;
- scanned = true;
- }
server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
-retry:
- while (!done && index <= end) {
- unsigned int i, nr_pages, found_pages, wsize;
- pgoff_t next = 0, tofind, saved_index = index;
- struct cifs_credits credits_on_stack;
- struct cifs_credits *credits = &credits_on_stack;
- int get_file_rc = 0;
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
+ if (rc) {
+ cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc);
+ goto err_xid;
+ }
- if (cfile)
- cifsFileInfo_put(cfile);
+ rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
+ &wsize, credits);
+ if (rc != 0)
+ goto err_close;
- rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
+ wdata = cifs_writedata_alloc(cifs_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ goto err_uncredit;
+ }
- /* in case of an error store it to return later */
- if (rc)
- get_file_rc = rc;
+ wdata->sync_mode = wbc->sync_mode;
+ wdata->offset = folio_pos(folio);
+ wdata->pid = cfile->pid;
+ wdata->credits = credits_on_stack;
+ wdata->cfile = cfile;
+ wdata->server = server;
+ cfile = NULL;
+
+ /* Find all consecutive lockable dirty pages, stopping when we find a
+ * page that is not immediately lockable, is not dirty or is missing,
+ * or we reach the end of the range.
+ */
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = wsize;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
- rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
- &wsize, credits);
- if (rc != 0) {
- done = true;
- break;
- }
+ if (len < max_len) {
+ int max_pages = INT_MAX;
- tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ max_pages = server->smbd_conn->max_frmr_depth;
+#endif
+ max_pages -= folio_nr_pages(folio);
- wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
- &found_pages);
- if (!wdata) {
- rc = -ENOMEM;
- done = true;
- add_credits_and_wake_if(server, credits, 0);
- break;
+ if (max_pages > 0)
+ cifs_extend_writeback(mapping, &count, start,
+ max_pages, max_len, &len);
}
+ len = min_t(loff_t, len, max_len);
+ }
- if (found_pages == 0) {
- kref_put(&wdata->refcount, cifs_writedata_release);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
+ wdata->bytes = len;
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+
+ if (start < i_size) {
+ iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
+ start, len);
- nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
- end, &index, &next, &done);
+ rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes);
+ if (rc)
+ goto err_wdata;
- /* nothing to write? */
- if (nr_pages == 0) {
+ if (wdata->cfile->invalidHandle)
+ rc = -EAGAIN;
+ else
+ rc = wdata->server->ops->async_writev(wdata,
+ cifs_writedata_release);
+ if (rc >= 0) {
kref_put(&wdata->refcount, cifs_writedata_release);
- add_credits_and_wake_if(server, credits, 0);
- continue;
+ goto err_close;
}
+ } else {
+ /* The dirty region was entirely beyond the EOF. */
+ cifs_pages_written_back(inode, start, len);
+ rc = 0;
+ }
- wdata->credits = credits_on_stack;
- wdata->cfile = cfile;
- wdata->server = server;
- cfile = NULL;
+err_wdata:
+ kref_put(&wdata->refcount, cifs_writedata_release);
+err_uncredit:
+ add_credits_and_wake_if(server, credits, 0);
+err_close:
+ if (cfile)
+ cifsFileInfo_put(cfile);
+err_xid:
+ free_xid(xid);
+ if (rc == 0) {
+ wbc->nr_to_write = count;
+ } else if (is_retryable_error(rc)) {
+ cifs_pages_write_redirty(inode, start, len);
+ } else {
+ cifs_pages_write_failed(inode, start, len);
+ mapping_set_error(mapping, rc);
+ }
+ /* Indication to update ctime and mtime as close is deferred */
+ set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
+ return rc;
+}
- if (!wdata->cfile) {
- cifs_dbg(VFS, "No writable handle in writepages rc=%d\n",
- get_file_rc);
- if (is_retryable_error(get_file_rc))
- rc = get_file_rc;
- else
- rc = -EBADF;
- } else
- rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
+/*
+ * write a region of pages back to the server
+ */
+static int cifs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ loff_t start, loff_t end, loff_t *_next)
+{
+ struct folio *folio;
+ struct page *head_page;
+ ssize_t ret;
+ int n, skips = 0;
- for (i = 0; i < nr_pages; ++i)
- unlock_page(wdata->pages[i]);
+ do {
+ pgoff_t index = start / PAGE_SIZE;
- /* send failure -- clean up the mess */
- if (rc != 0) {
- add_credits_and_wake_if(server, &wdata->credits, 0);
- for (i = 0; i < nr_pages; ++i) {
- if (is_retryable_error(rc))
- redirty_page_for_writepage(wbc,
- wdata->pages[i]);
- else
- SetPageError(wdata->pages[i]);
- end_page_writeback(wdata->pages[i]);
- put_page(wdata->pages[i]);
+ n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
+ PAGECACHE_TAG_DIRTY, 1, &head_page);
+ if (!n)
+ break;
+
+ folio = page_folio(head_page);
+ start = folio_pos(folio); /* May regress with THPs */
+
+ /* At this point we hold neither the i_pages lock nor the
+ * page lock: the page may be truncated or invalidated
+ * (changing page->mapping to NULL), or even swizzled
+ * back from swapper_space to tmpfs file mapping
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+ } else {
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
}
- if (!is_retryable_error(rc))
- mapping_set_error(mapping, rc);
}
- kref_put(&wdata->refcount, cifs_writedata_release);
- if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
- index = saved_index;
+ if (folio_mapping(folio) != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ folio_put(folio);
continue;
}
- /* Return immediately if we received a signal during writing */
- if (is_interrupt_error(rc)) {
- done = true;
- break;
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_CIFS_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ } else {
+ start += folio_size(folio);
+ }
+ folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched())
+ break;
+ skips++;
+ }
+ continue;
}
- if (rc != 0 && saved_rc == 0)
- saved_rc = rc;
+ if (!folio_clear_dirty_for_io(folio))
+ /* We hold the page lock - it should've been dirty. */
+ WARN_ON(1);
- wbc->nr_to_write -= nr_pages;
- if (wbc->nr_to_write <= 0)
- done = true;
+ ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
+ folio_put(folio);
+ if (ret < 0)
+ return ret;
- index = next;
- }
+ start += ret;
+ cond_resched();
+ } while (wbc->nr_to_write > 0);
- if (!scanned && !done) {
- /*
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- scanned = true;
- index = 0;
- goto retry;
- }
+ *_next = start;
+ return 0;
+}
- if (saved_rc != 0)
- rc = saved_rc;
+/*
+ * Write some of the pending data back to the server
+ */
+static int cifs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ loff_t start, next;
+ int ret;
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
- if (cfile)
- cifsFileInfo_put(cfile);
- free_xid(xid);
- /* Indication to update ctime and mtime as close is deferred */
- set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
- return rc;
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
+ if (ret == 0) {
+ mapping->writeback_index = next / PAGE_SIZE;
+ if (start > 0 && wbc->nr_to_write > 0) {
+ ret = cifs_writepages_region(mapping, wbc, 0,
+ start, &next);
+ if (ret == 0)
+ mapping->writeback_index =
+ next / PAGE_SIZE;
+ }
+ }
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = next / PAGE_SIZE;
+ } else {
+ ret = cifs_writepages_region(mapping, wbc,
+ wbc->range_start, wbc->range_end, &next);
+ }
+
+ return ret;
}
static int
@@ -2877,6 +3024,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct cifsFileInfo *cfile = file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct folio *folio = page_folio(page);
__u32 pid;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
@@ -2887,14 +3035,14 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
page, pos, copied);
- if (PageChecked(page)) {
+ if (folio_test_checked(folio)) {
if (copied == len)
- SetPageUptodate(page);
- ClearPageChecked(page);
- } else if (!PageUptodate(page) && copied == PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
+ folio_clear_checked(folio);
+ } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE)
+ folio_mark_uptodate(folio);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
char *page_data;
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned int xid;
@@ -3054,57 +3202,13 @@ int cifs_flush(struct file *file, fl_owner_t id)
return rc;
}
-static int
-cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
-{
- int rc = 0;
- unsigned long i;
-
- for (i = 0; i < num_pages; i++) {
- pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!pages[i]) {
- /*
- * save number of pages we have already allocated and
- * return with ENOMEM error
- */
- num_pages = i;
- rc = -ENOMEM;
- break;
- }
- }
-
- if (rc) {
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
- }
- return rc;
-}
-
-static inline
-size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
-{
- size_t num_pages;
- size_t clen;
-
- clen = min_t(const size_t, len, wsize);
- num_pages = DIV_ROUND_UP(clen, PAGE_SIZE);
-
- if (cur_len)
- *cur_len = clen;
-
- return num_pages;
-}
-
static void
cifs_uncached_writedata_release(struct kref *refcount)
{
- int i;
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
- for (i = 0; i < wdata->nr_pages; i++)
- put_page(wdata->pages[i]);
cifs_writedata_release(refcount);
}
@@ -3131,48 +3235,6 @@ cifs_uncached_writev_complete(struct work_struct *work)
}
static int
-wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
- size_t *len, unsigned long *num_pages)
-{
- size_t save_len, copied, bytes, cur_len = *len;
- unsigned long i, nr_pages = *num_pages;
-
- save_len = cur_len;
- for (i = 0; i < nr_pages; i++) {
- bytes = min_t(const size_t, cur_len, PAGE_SIZE);
- copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
- cur_len -= copied;
- /*
- * If we didn't copy as much as we expected, then that
- * may mean we trod into an unmapped area. Stop copying
- * at that point. On the next pass through the big
- * loop, we'll likely end up getting a zero-length
- * write and bailing out of it.
- */
- if (copied < bytes)
- break;
- }
- cur_len = save_len - cur_len;
- *len = cur_len;
-
- /*
- * If we have no data to send, then that probably means that
- * the copy above failed altogether. That's most likely because
- * the address in the iovec was bogus. Return -EFAULT and let
- * the caller free anything we allocated and bail out.
- */
- if (!cur_len)
- return -EFAULT;
-
- /*
- * i + 1 now represents the number of pages we actually used in
- * the copy phase above.
- */
- *num_pages = i + 1;
- return 0;
-}
-
-static int
cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
@@ -3242,23 +3304,57 @@ fail:
return rc;
}
+/*
+ * Select span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments.
+ */
+static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size,
+ size_t max_segs, unsigned int *_nsegs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ *_nsegs = nsegs;
+ return span;
+}
+
static int
-cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
+cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from,
struct cifsFileInfo *open_file,
struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
struct cifs_aio_ctx *ctx)
{
int rc = 0;
- size_t cur_len;
- unsigned long nr_pages, num_pages, i;
+ size_t cur_len, max_len;
struct cifs_writedata *wdata;
- struct iov_iter saved_from = *from;
- loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
- struct page **pagevec;
- size_t start;
- unsigned int xid;
+ unsigned int xid, max_segs = INT_MAX;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -3268,10 +3364,20 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
xid = get_xid();
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ max_segs = server->smbd_conn->max_frmr_depth;
+#endif
+
do {
- unsigned int wsize;
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
+ unsigned int wsize, nsegs = 0;
+
+ if (signal_pending(current)) {
+ rc = -EINTR;
+ break;
+ }
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, false);
@@ -3286,99 +3392,42 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
if (rc)
break;
- cur_len = min_t(const size_t, len, wsize);
-
- if (ctx->direct_io) {
- ssize_t result;
-
- result = iov_iter_get_pages_alloc2(
- from, &pagevec, cur_len, &start);
- if (result < 0) {
- cifs_dbg(VFS,
- "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n",
- result, iov_iter_type(from),
- from->iov_offset, from->count);
- dump_stack();
-
- rc = result;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
- cur_len = (size_t)result;
-
- nr_pages =
- (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
-
- wdata = cifs_writedata_direct_alloc(pagevec,
- cifs_uncached_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- for (i = 0; i < nr_pages; i++)
- put_page(pagevec[i]);
- kvfree(pagevec);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
-
- wdata->page_offset = start;
- wdata->tailsz =
- nr_pages > 1 ?
- cur_len - (PAGE_SIZE - start) -
- (nr_pages - 2) * PAGE_SIZE :
- cur_len;
- } else {
- nr_pages = get_numpages(wsize, len, &cur_len);
- wdata = cifs_writedata_alloc(nr_pages,
- cifs_uncached_writev_complete);
- if (!wdata) {
- rc = -ENOMEM;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
- if (rc) {
- kvfree(wdata->pages);
- kfree(wdata);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
-
- num_pages = nr_pages;
- rc = wdata_fill_from_iovec(
- wdata, from, &cur_len, &num_pages);
- if (rc) {
- for (i = 0; i < nr_pages; i++)
- put_page(wdata->pages[i]);
- kvfree(wdata->pages);
- kfree(wdata);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
+ max_len = min_t(const size_t, len, wsize);
+ if (!max_len) {
+ rc = -EAGAIN;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
- /*
- * Bring nr_pages down to the number of pages we
- * actually used, and free any pages that we didn't use.
- */
- for ( ; nr_pages > num_pages; nr_pages--)
- put_page(wdata->pages[nr_pages - 1]);
+ cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs);
+ cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n",
+ cur_len, max_len, nsegs, from->nr_segs, max_segs);
+ if (cur_len == 0) {
+ rc = -EIO;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
- wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
+ wdata = cifs_writedata_alloc(cifs_uncached_writev_complete);
+ if (!wdata) {
+ rc = -ENOMEM;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
}
wdata->sync_mode = WB_SYNC_ALL;
- wdata->nr_pages = nr_pages;
- wdata->offset = (__u64)offset;
- wdata->cfile = cifsFileInfo_get(open_file);
- wdata->server = server;
- wdata->pid = pid;
- wdata->bytes = cur_len;
- wdata->pagesz = PAGE_SIZE;
- wdata->credits = credits_on_stack;
- wdata->ctx = ctx;
+ wdata->offset = (__u64)fpos;
+ wdata->cfile = cifsFileInfo_get(open_file);
+ wdata->server = server;
+ wdata->pid = pid;
+ wdata->bytes = cur_len;
+ wdata->credits = credits_on_stack;
+ wdata->iter = *from;
+ wdata->ctx = ctx;
kref_get(&ctx->refcount);
+ iov_iter_truncate(&wdata->iter, cur_len);
+
rc = adjust_credits(server, &wdata->credits, wdata->bytes);
if (!rc) {
@@ -3393,16 +3442,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
add_credits_and_wake_if(server, &wdata->credits, 0);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
- if (rc == -EAGAIN) {
- *from = saved_from;
- iov_iter_advance(from, offset - saved_offset);
+ if (rc == -EAGAIN)
continue;
- }
break;
}
list_add_tail(&wdata->list, wdata_list);
- offset += cur_len;
+ iov_iter_advance(from, cur_len);
+ fpos += cur_len;
len -= cur_len;
} while (len > 0);
@@ -3501,20 +3548,8 @@ static ssize_t __cifs_writev(
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
struct cifs_aio_ctx *ctx;
- struct iov_iter saved_from = *from;
- size_t len = iov_iter_count(from);
int rc;
- /*
- * iov_iter_get_pages_alloc doesn't work with ITER_KVEC.
- * In this case, fall back to non-direct write function.
- * this could be improved by getting pages directly in ITER_KVEC
- */
- if (direct && iov_iter_is_kvec(from)) {
- cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
- direct = false;
- }
-
rc = generic_write_checks(iocb, from);
if (rc <= 0)
return rc;
@@ -3536,23 +3571,54 @@ static ssize_t __cifs_writev(
ctx->iocb = iocb;
ctx->pos = iocb->ki_pos;
+ ctx->direct_io = direct;
+ ctx->nr_pinned_pages = 0;
- if (direct) {
- ctx->direct_io = true;
- ctx->iter = *from;
- ctx->len = len;
- } else {
- rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE);
- if (rc) {
+ if (user_backed_iter(from)) {
+ /*
+ * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
+ * they contain references to the calling process's virtual
+ * memory layout which won't be available in an async worker
+ * thread. This also takes a pin on every folio involved.
+ */
+ rc = netfs_extract_user_iter(from, iov_iter_count(from),
+ &ctx->iter, 0);
+ if (rc < 0) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
}
+
+ ctx->nr_pinned_pages = rc;
+ ctx->bv = (void *)ctx->iter.bvec;
+ ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter);
+ } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) &&
+ !is_sync_kiocb(iocb)) {
+ /*
+ * If the op is asynchronous, we need to copy the list attached
+ * to a BVEC/KVEC-type iterator, but we assume that the storage
+ * will be pinned by the caller; in any case, we may or may not
+ * be able to pin the pages, so we don't try.
+ */
+ ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL);
+ if (!ctx->bv) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -ENOMEM;
+ }
+ } else {
+ /*
+ * Otherwise, we just pass the iterator down as-is and rely on
+ * the caller to make sure the pages referred to by the
+ * iterator don't evaporate.
+ */
+ ctx->iter = *from;
}
+ ctx->len = iov_iter_count(&ctx->iter);
+
/* grab a lock here due to read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
- rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from,
+ rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter,
cfile, cifs_sb, &ctx->list, ctx);
/*
@@ -3695,14 +3761,12 @@ out:
return written;
}
-static struct cifs_readdata *
-cifs_readdata_direct_alloc(struct page **pages, work_func_t complete)
+static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete)
{
struct cifs_readdata *rdata;
rdata = kzalloc(sizeof(*rdata), GFP_KERNEL);
- if (rdata != NULL) {
- rdata->pages = pages;
+ if (rdata) {
kref_init(&rdata->refcount);
INIT_LIST_HEAD(&rdata->list);
init_completion(&rdata->done);
@@ -3712,27 +3776,14 @@ cifs_readdata_direct_alloc(struct page **pages, work_func_t complete)
return rdata;
}
-static struct cifs_readdata *
-cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete)
-{
- struct page **pages =
- kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- struct cifs_readdata *ret = NULL;
-
- if (pages) {
- ret = cifs_readdata_direct_alloc(pages, complete);
- if (!ret)
- kfree(pages);
- }
-
- return ret;
-}
-
void
cifs_readdata_release(struct kref *refcount)
{
struct cifs_readdata *rdata = container_of(refcount,
struct cifs_readdata, refcount);
+
+ if (rdata->ctx)
+ kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (rdata->mr) {
smbd_deregister_mr(rdata->mr);
@@ -3742,85 +3793,9 @@ cifs_readdata_release(struct kref *refcount)
if (rdata->cfile)
cifsFileInfo_put(rdata->cfile);
- kvfree(rdata->pages);
kfree(rdata);
}
-static int
-cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
-{
- int rc = 0;
- struct page *page;
- unsigned int i;
-
- for (i = 0; i < nr_pages; i++) {
- page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!page) {
- rc = -ENOMEM;
- break;
- }
- rdata->pages[i] = page;
- }
-
- if (rc) {
- unsigned int nr_page_failed = i;
-
- for (i = 0; i < nr_page_failed; i++) {
- put_page(rdata->pages[i]);
- rdata->pages[i] = NULL;
- }
- }
- return rc;
-}
-
-static void
-cifs_uncached_readdata_release(struct kref *refcount)
-{
- struct cifs_readdata *rdata = container_of(refcount,
- struct cifs_readdata, refcount);
- unsigned int i;
-
- kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
- for (i = 0; i < rdata->nr_pages; i++) {
- put_page(rdata->pages[i]);
- }
- cifs_readdata_release(refcount);
-}
-
-/**
- * cifs_readdata_to_iov - copy data from pages in response to an iovec
- * @rdata: the readdata response with list of pages holding data
- * @iter: destination for our data
- *
- * This function copies data from a list of pages in a readdata response into
- * an array of iovecs. It will first calculate where the data should go
- * based on the info in the readdata and then copy the data into that spot.
- */
-static int
-cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
-{
- size_t remaining = rdata->got_bytes;
- unsigned int i;
-
- for (i = 0; i < rdata->nr_pages; i++) {
- struct page *page = rdata->pages[i];
- size_t copy = min_t(size_t, remaining, PAGE_SIZE);
- size_t written;
-
- if (unlikely(iov_iter_is_pipe(iter))) {
- void *addr = kmap_atomic(page);
-
- written = copy_to_iter(addr, copy, iter);
- kunmap_atomic(addr);
- } else
- written = copy_page_to_iter(page, 0, copy, iter);
- remaining -= written;
- if (written < copy && iov_iter_count(iter) > 0)
- break;
- }
- return remaining ? -EFAULT : 0;
-}
-
static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
static void
@@ -3832,81 +3807,7 @@ cifs_uncached_readv_complete(struct work_struct *work)
complete(&rdata->done);
collect_uncached_read_data(rdata->ctx);
/* the below call can possibly free the last ref to aio ctx */
- kref_put(&rdata->refcount, cifs_uncached_readdata_release);
-}
-
-static int
-uncached_fill_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata, struct iov_iter *iter,
- unsigned int len)
-{
- int result = 0;
- unsigned int i;
- unsigned int nr_pages = rdata->nr_pages;
- unsigned int page_offset = rdata->page_offset;
-
- rdata->got_bytes = 0;
- rdata->tailsz = PAGE_SIZE;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = rdata->pages[i];
- size_t n;
- unsigned int segment_size = rdata->pagesz;
-
- if (i == 0)
- segment_size -= page_offset;
- else
- page_offset = 0;
-
-
- if (len <= 0) {
- /* no need to hold page hostage */
- rdata->pages[i] = NULL;
- rdata->nr_pages--;
- put_page(page);
- continue;
- }
-
- n = len;
- if (len >= segment_size)
- /* enough data to fill the page */
- n = segment_size;
- else
- rdata->tailsz = len;
- len -= n;
-
- if (iter)
- result = copy_page_from_iter(
- page, page_offset, n, iter);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- else if (rdata->mr)
- result = n;
-#endif
- else
- result = cifs_read_page_from_socket(
- server, page, page_offset, n);
- if (result < 0)
- break;
-
- rdata->got_bytes += result;
- }
-
- return result != -ECONNABORTED && rdata->got_bytes > 0 ?
- rdata->got_bytes : result;
-}
-
-static int
-cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata, unsigned int len)
-{
- return uncached_fill_pages(server, rdata, NULL, len);
-}
-
-static int
-cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata,
- struct iov_iter *iter)
-{
- return uncached_fill_pages(server, rdata, iter, iter->count);
+ kref_put(&rdata->refcount, cifs_readdata_release);
}
static int cifs_resend_rdata(struct cifs_readdata *rdata,
@@ -3977,37 +3878,36 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
} while (rc == -EAGAIN);
fail:
- kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+ kref_put(&rdata->refcount, cifs_readdata_release);
return rc;
}
static int
-cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
+cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file,
struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
- unsigned int npages, rsize;
+ unsigned int rsize, nsegs, max_segs = INT_MAX;
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
- size_t cur_len;
+ size_t cur_len, max_len;
int rc;
pid_t pid;
struct TCP_Server_Info *server;
- struct page **pagevec;
- size_t start;
- struct iov_iter direct_iov = ctx->iter;
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (server->smbd_conn)
+ max_segs = server->smbd_conn->max_frmr_depth;
+#endif
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
else
pid = current->tgid;
- if (ctx->direct_io)
- iov_iter_advance(&direct_iov, offset - ctx->pos);
-
do {
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
@@ -4027,78 +3927,37 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
if (rc)
break;
- cur_len = min_t(const size_t, len, rsize);
-
- if (ctx->direct_io) {
- ssize_t result;
-
- result = iov_iter_get_pages_alloc2(
- &direct_iov, &pagevec,
- cur_len, &start);
- if (result < 0) {
- cifs_dbg(VFS,
- "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n",
- result, iov_iter_type(&direct_iov),
- direct_iov.iov_offset,
- direct_iov.count);
- dump_stack();
-
- rc = result;
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
- cur_len = (size_t)result;
-
- rdata = cifs_readdata_direct_alloc(
- pagevec, cifs_uncached_readv_complete);
- if (!rdata) {
- add_credits_and_wake_if(server, credits, 0);
- rc = -ENOMEM;
- break;
- }
-
- npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE;
- rdata->page_offset = start;
- rdata->tailsz = npages > 1 ?
- cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE :
- cur_len;
-
- } else {
+ max_len = min_t(size_t, len, rsize);
- npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
- /* allocate a readdata struct */
- rdata = cifs_readdata_alloc(npages,
- cifs_uncached_readv_complete);
- if (!rdata) {
- add_credits_and_wake_if(server, credits, 0);
- rc = -ENOMEM;
- break;
- }
-
- rc = cifs_read_allocate_pages(rdata, npages);
- if (rc) {
- kvfree(rdata->pages);
- kfree(rdata);
- add_credits_and_wake_if(server, credits, 0);
- break;
- }
+ cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len,
+ max_segs, &nsegs);
+ cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n",
+ cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs);
+ if (cur_len == 0) {
+ rc = -EIO;
+ add_credits_and_wake_if(server, credits, 0);
+ break;
+ }
- rdata->tailsz = PAGE_SIZE;
+ rdata = cifs_readdata_alloc(cifs_uncached_readv_complete);
+ if (!rdata) {
+ add_credits_and_wake_if(server, credits, 0);
+ rc = -ENOMEM;
+ break;
}
- rdata->server = server;
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->nr_pages = npages;
- rdata->offset = offset;
- rdata->bytes = cur_len;
- rdata->pid = pid;
- rdata->pagesz = PAGE_SIZE;
- rdata->read_into_pages = cifs_uncached_read_into_pages;
- rdata->copy_into_pages = cifs_uncached_copy_into_pages;
- rdata->credits = credits_on_stack;
- rdata->ctx = ctx;
+ rdata->server = server;
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->offset = fpos;
+ rdata->bytes = cur_len;
+ rdata->pid = pid;
+ rdata->credits = credits_on_stack;
+ rdata->ctx = ctx;
kref_get(&ctx->refcount);
+ rdata->iter = ctx->iter;
+ iov_iter_truncate(&rdata->iter, cur_len);
+
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
if (!rc) {
@@ -4110,17 +3969,15 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
if (rc) {
add_credits_and_wake_if(server, &rdata->credits, 0);
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
- if (rc == -EAGAIN) {
- iov_iter_revert(&direct_iov, cur_len);
+ kref_put(&rdata->refcount, cifs_readdata_release);
+ if (rc == -EAGAIN)
continue;
- }
break;
}
list_add_tail(&rdata->list, rdata_list);
- offset += cur_len;
+ iov_iter_advance(&ctx->iter, cur_len);
+ fpos += cur_len;
len -= cur_len;
} while (len > 0);
@@ -4162,22 +4019,6 @@ again:
list_del_init(&rdata->list);
INIT_LIST_HEAD(&tmp_list);
- /*
- * Got a part of data and then reconnect has
- * happened -- fill the buffer and continue
- * reading.
- */
- if (got_bytes && got_bytes < rdata->bytes) {
- rc = 0;
- if (!ctx->direct_io)
- rc = cifs_readdata_to_iov(rdata, to);
- if (rc) {
- kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
- continue;
- }
- }
-
if (ctx->direct_io) {
/*
* Re-use rdata as this is a
@@ -4194,7 +4035,7 @@ again:
&tmp_list, ctx);
kref_put(&rdata->refcount,
- cifs_uncached_readdata_release);
+ cifs_readdata_release);
}
list_splice(&tmp_list, &ctx->list);
@@ -4202,8 +4043,6 @@ again:
goto again;
} else if (rdata->result)
rc = rdata->result;
- else if (!ctx->direct_io)
- rc = cifs_readdata_to_iov(rdata, to);
/* if there was a short read -- discard anything left */
if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
@@ -4212,7 +4051,7 @@ again:
ctx->total_len += rdata->got_bytes;
}
list_del_init(&rdata->list);
- kref_put(&rdata->refcount, cifs_uncached_readdata_release);
+ kref_put(&rdata->refcount, cifs_readdata_release);
}
if (!ctx->direct_io)
@@ -4244,16 +4083,6 @@ static ssize_t __cifs_readv(
loff_t offset = iocb->ki_pos;
struct cifs_aio_ctx *ctx;
- /*
- * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC,
- * fall back to data copy read path
- * this could be improved by getting pages directly in ITER_KVEC
- */
- if (direct && iov_iter_is_kvec(to)) {
- cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
- direct = false;
- }
-
len = iov_iter_count(to);
if (!len)
return 0;
@@ -4272,26 +4101,53 @@ static ssize_t __cifs_readv(
if (!ctx)
return -ENOMEM;
- ctx->cfile = cifsFileInfo_get(cfile);
+ ctx->pos = offset;
+ ctx->direct_io = direct;
+ ctx->len = len;
+ ctx->cfile = cifsFileInfo_get(cfile);
+ ctx->nr_pinned_pages = 0;
if (!is_sync_kiocb(iocb))
ctx->iocb = iocb;
- if (user_backed_iter(to))
- ctx->should_dirty = true;
-
- if (direct) {
- ctx->pos = offset;
- ctx->direct_io = true;
- ctx->iter = *to;
- ctx->len = len;
- } else {
- rc = setup_aio_ctx_iter(ctx, to, ITER_DEST);
- if (rc) {
+ if (user_backed_iter(to)) {
+ /*
+ * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as
+ * they contain references to the calling process's virtual
+ * memory layout which won't be available in an async worker
+ * thread. This also takes a pin on every folio involved.
+ */
+ rc = netfs_extract_user_iter(to, iov_iter_count(to),
+ &ctx->iter, 0);
+ if (rc < 0) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
}
- len = ctx->len;
+
+ ctx->nr_pinned_pages = rc;
+ ctx->bv = (void *)ctx->iter.bvec;
+ ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter);
+ ctx->should_dirty = true;
+ } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) &&
+ !is_sync_kiocb(iocb)) {
+ /*
+ * If the op is asynchronous, we need to copy the list attached
+ * to a BVEC/KVEC-type iterator, but we assume that the storage
+ * will be retained by the caller; in any case, we may or may
+ * not be able to pin the pages, so we don't try.
+ */
+ ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL);
+ if (!ctx->bv) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -ENOMEM;
+ }
+ } else {
+ /*
+ * Otherwise, we just pass the iterator down as-is and rely on
+ * the caller to make sure the pages referred to by the
+ * iterator don't evaporate.
+ */
+ ctx->iter = *to;
}
if (direct) {
@@ -4490,23 +4346,22 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
* If the page is mmap'ed into a process' page tables, then we need to make
* sure that it doesn't change while being written back.
*/
-static vm_fault_t
-cifs_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
+ /* Wait for the folio to be written to the cache before we allow it to
+ * be modified. We then assume the entire folio will need writing back.
*/
#ifdef CONFIG_CIFS_FSCACHE
- if (PageFsCache(page) &&
- wait_on_page_fscache_killable(page) < 0)
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0)
return VM_FAULT_RETRY;
#endif
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (lock_page_killable(page) < 0)
+ if (folio_lock_killable(folio) < 0)
return VM_FAULT_RETRY;
return VM_FAULT_LOCKED;
}
@@ -4554,149 +4409,72 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return rc;
}
-static void
-cifs_readv_complete(struct work_struct *work)
+/*
+ * Unlock a bunch of folios in the pagecache.
+ */
+static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last)
{
- unsigned int i, got_bytes;
- struct cifs_readdata *rdata = container_of(work,
- struct cifs_readdata, work);
+ struct folio *folio;
+ XA_STATE(xas, &mapping->i_pages, first);
- got_bytes = rdata->got_bytes;
- for (i = 0; i < rdata->nr_pages; i++) {
- struct page *page = rdata->pages[i];
-
- if (rdata->result == 0 ||
- (rdata->result == -EAGAIN && got_bytes)) {
- flush_dcache_page(page);
- SetPageUptodate(page);
- } else
- SetPageError(page);
-
- if (rdata->result == 0 ||
- (rdata->result == -EAGAIN && got_bytes))
- cifs_readpage_to_fscache(rdata->mapping->host, page);
-
- unlock_page(page);
-
- got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
-
- put_page(page);
- rdata->pages[i] = NULL;
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last) {
+ folio_unlock(folio);
}
- kref_put(&rdata->refcount, cifs_readdata_release);
+ rcu_read_unlock();
}
-static int
-readpages_fill_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata, struct iov_iter *iter,
- unsigned int len)
+static void cifs_readahead_complete(struct work_struct *work)
{
- int result = 0;
- unsigned int i;
- u64 eof;
- pgoff_t eof_index;
- unsigned int nr_pages = rdata->nr_pages;
- unsigned int page_offset = rdata->page_offset;
-
- /* determine the eof that the server (probably) has */
- eof = CIFS_I(rdata->mapping->host)->server_eof;
- eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0;
- cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
-
- rdata->got_bytes = 0;
- rdata->tailsz = PAGE_SIZE;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = rdata->pages[i];
- unsigned int to_read = rdata->pagesz;
- size_t n;
-
- if (i == 0)
- to_read -= page_offset;
- else
- page_offset = 0;
-
- n = to_read;
-
- if (len >= to_read) {
- len -= to_read;
- } else if (len > 0) {
- /* enough for partial page, fill and zero the rest */
- zero_user(page, len + page_offset, to_read - len);
- n = rdata->tailsz = len;
- len = 0;
- } else if (page->index > eof_index) {
- /*
- * The VFS will not try to do readahead past the
- * i_size, but it's possible that we have outstanding
- * writes with gaps in the middle and the i_size hasn't
- * caught up yet. Populate those with zeroed out pages
- * to prevent the VFS from repeatedly attempting to
- * fill them until the writes are flushed.
- */
- zero_user(page, 0, PAGE_SIZE);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
- put_page(page);
- rdata->pages[i] = NULL;
- rdata->nr_pages--;
- continue;
- } else {
- /* no need to hold page hostage */
- unlock_page(page);
- put_page(page);
- rdata->pages[i] = NULL;
- rdata->nr_pages--;
- continue;
- }
+ struct cifs_readdata *rdata = container_of(work,
+ struct cifs_readdata, work);
+ struct folio *folio;
+ pgoff_t last;
+ bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes);
- if (iter)
- result = copy_page_from_iter(
- page, page_offset, n, iter);
-#ifdef CONFIG_CIFS_SMB_DIRECT
- else if (rdata->mr)
- result = n;
-#endif
- else
- result = cifs_read_page_from_socket(
- server, page, page_offset, n);
- if (result < 0)
- break;
+ XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE);
- rdata->got_bytes += result;
- }
+ if (good)
+ cifs_readahead_to_fscache(rdata->mapping->host,
+ rdata->offset, rdata->bytes);
- return result != -ECONNABORTED && rdata->got_bytes > 0 ?
- rdata->got_bytes : result;
-}
+ if (iov_iter_count(&rdata->iter) > 0)
+ iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter);
-static int
-cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata, unsigned int len)
-{
- return readpages_fill_pages(server, rdata, NULL, len);
-}
+ last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE;
-static int
-cifs_readpages_copy_into_pages(struct TCP_Server_Info *server,
- struct cifs_readdata *rdata,
- struct iov_iter *iter)
-{
- return readpages_fill_pages(server, rdata, iter, iter->count);
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last) {
+ if (good) {
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ }
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+
+ kref_put(&rdata->refcount, cifs_readdata_release);
}
static void cifs_readahead(struct readahead_control *ractl)
{
- int rc;
struct cifsFileInfo *open_file = ractl->file->private_data;
struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file);
struct TCP_Server_Info *server;
- pid_t pid;
- unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0;
- pgoff_t next_cached = ULONG_MAX;
+ unsigned int xid, nr_pages, cache_nr_pages = 0;
+ unsigned int ra_pages;
+ pgoff_t next_cached = ULONG_MAX, ra_index;
bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) &&
cifs_inode_cookie(ractl->mapping->host)->cache_priv;
bool check_cache = caching;
+ pid_t pid;
+ int rc = 0;
+
+ /* Note that readahead_count() lags behind our dequeuing of pages from
+ * the ractl, wo we have to keep track for ourselves.
+ */
+ ra_pages = readahead_count(ractl);
+ ra_index = readahead_index(ractl);
xid = get_xid();
@@ -4705,22 +4483,21 @@ static void cifs_readahead(struct readahead_control *ractl)
else
pid = current->tgid;
- rc = 0;
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
- __func__, ractl->file, ractl->mapping, readahead_count(ractl));
+ __func__, ractl->file, ractl->mapping, ra_pages);
/*
* Chop the readahead request up into rsize-sized read requests.
*/
- while ((nr_pages = readahead_count(ractl) - last_batch_size)) {
- unsigned int i, got, rsize;
- struct page *page;
+ while ((nr_pages = ra_pages)) {
+ unsigned int i, rsize;
struct cifs_readdata *rdata;
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
- pgoff_t index = readahead_index(ractl) + last_batch_size;
+ struct folio *folio;
+ pgoff_t fsize;
/*
* Find out if we have anything cached in the range of
@@ -4729,21 +4506,22 @@ static void cifs_readahead(struct readahead_control *ractl)
if (caching) {
if (check_cache) {
rc = cifs_fscache_query_occupancy(
- ractl->mapping->host, index, nr_pages,
+ ractl->mapping->host, ra_index, nr_pages,
&next_cached, &cache_nr_pages);
if (rc < 0)
caching = false;
check_cache = false;
}
- if (index == next_cached) {
+ if (ra_index == next_cached) {
/*
* TODO: Send a whole batch of pages to be read
* by the cache.
*/
- struct folio *folio = readahead_folio(ractl);
-
- last_batch_size = folio_nr_pages(folio);
+ folio = readahead_folio(ractl);
+ fsize = folio_nr_pages(folio);
+ ra_pages -= fsize;
+ ra_index += fsize;
if (cifs_readpage_from_fscache(ractl->mapping->host,
&folio->page) < 0) {
/*
@@ -4754,8 +4532,8 @@ static void cifs_readahead(struct readahead_control *ractl)
caching = false;
}
folio_unlock(folio);
- next_cached++;
- cache_nr_pages--;
+ next_cached += fsize;
+ cache_nr_pages -= fsize;
if (cache_nr_pages == 0)
check_cache = true;
continue;
@@ -4780,8 +4558,9 @@ static void cifs_readahead(struct readahead_control *ractl)
&rsize, credits);
if (rc)
break;
- nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl));
- nr_pages = min_t(size_t, nr_pages, next_cached - index);
+ nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages);
+ if (next_cached != ULONG_MAX)
+ nr_pages = min_t(size_t, nr_pages, next_cached - ra_index);
/*
* Give up immediately if rsize is too small to read an entire
@@ -4794,33 +4573,31 @@ static void cifs_readahead(struct readahead_control *ractl)
break;
}
- rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
+ rdata = cifs_readdata_alloc(cifs_readahead_complete);
if (!rdata) {
/* best to give up if we're out of mem */
add_credits_and_wake_if(server, credits, 0);
break;
}
- got = __readahead_batch(ractl, rdata->pages, nr_pages);
- if (got != nr_pages) {
- pr_warn("__readahead_batch() returned %u/%u\n",
- got, nr_pages);
- nr_pages = got;
- }
-
- rdata->nr_pages = nr_pages;
- rdata->bytes = readahead_batch_length(ractl);
+ rdata->offset = ra_index * PAGE_SIZE;
+ rdata->bytes = nr_pages * PAGE_SIZE;
rdata->cfile = cifsFileInfo_get(open_file);
rdata->server = server;
rdata->mapping = ractl->mapping;
- rdata->offset = readahead_pos(ractl);
rdata->pid = pid;
- rdata->pagesz = PAGE_SIZE;
- rdata->tailsz = PAGE_SIZE;
- rdata->read_into_pages = cifs_readpages_read_into_pages;
- rdata->copy_into_pages = cifs_readpages_copy_into_pages;
rdata->credits = credits_on_stack;
+ for (i = 0; i < nr_pages; i++) {
+ if (!readahead_folio(ractl))
+ WARN_ON(1);
+ }
+ ra_pages -= nr_pages;
+ ra_index += nr_pages;
+
+ iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages,
+ rdata->offset, rdata->bytes);
+
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
if (!rc) {
if (rdata->cfile->invalidHandle)
@@ -4831,18 +4608,15 @@ static void cifs_readahead(struct readahead_control *ractl)
if (rc) {
add_credits_and_wake_if(server, &rdata->credits, 0);
- for (i = 0; i < rdata->nr_pages; i++) {
- page = rdata->pages[i];
- unlock_page(page);
- put_page(page);
- }
+ cifs_unlock_folios(rdata->mapping,
+ rdata->offset / PAGE_SIZE,
+ (rdata->offset + rdata->bytes - 1) / PAGE_SIZE);
/* Fallback to the readpage in error/reconnect cases */
kref_put(&rdata->refcount, cifs_readdata_release);
break;
}
kref_put(&rdata->refcount, cifs_readdata_release);
- last_batch_size = nr_pages;
}
free_xid(xid);
@@ -4884,10 +4658,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
flush_dcache_page(page);
SetPageUptodate(page);
-
- /* send this page to the cache */
- cifs_readpage_to_fscache(file_inode(file), page);
-
rc = 0;
io_error:
@@ -5274,3 +5044,19 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.launder_folio = cifs_launder_folio,
.migrate_folio = filemap_migrate_folio,
};
+
+/*
+ * Splice data from a file into a pipe.
+ */
+ssize_t cifs_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes))
+ return 0;
+ if (unlikely(!len))
+ return 0;
+ if (in->f_flags & O_DIRECT)
+ return direct_splice_read(in, ppos, pipe, len, flags);
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 0911327ebfde..8f6909d633da 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -163,20 +163,16 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
/*
* Fallback page writing interface.
*/
-static int fscache_fallback_write_page(struct inode *inode, struct page *page,
- bool no_space_allocated_yet)
+static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len,
+ bool no_space_allocated_yet)
{
struct netfs_cache_resources cres;
struct fscache_cookie *cookie = cifs_inode_cookie(inode);
struct iov_iter iter;
- struct bio_vec bvec;
- loff_t start = page_offset(page);
- size_t len = PAGE_SIZE;
int ret;
memset(&cres, 0, sizeof(cres));
- bvec_set_page(&bvec, page, PAGE_SIZE, 0);
- iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+ iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
@@ -185,7 +181,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
no_space_allocated_yet);
if (ret == 0)
- ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
+ ret = fscache_write(&cres, start, &iter, NULL, NULL);
fscache_end_operation(&cres);
return ret;
}
@@ -209,12 +205,12 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
return 0;
}
-void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len)
{
- cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
- __func__, cifs_inode_cookie(inode), page, inode);
+ cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n",
+ __func__, cifs_inode_cookie(inode), pos, len, inode);
- fscache_fallback_write_page(inode, page, true);
+ fscache_fallback_write_pages(inode, pos, len, true);
}
/*
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 67b601041f0a..173999610997 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -90,7 +90,7 @@ static inline int cifs_fscache_query_occupancy(struct inode *inode,
}
extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage);
-extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage);
+extern void __cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len);
static inline int cifs_readpage_from_fscache(struct inode *inode,
@@ -101,11 +101,11 @@ static inline int cifs_readpage_from_fscache(struct inode *inode,
return -ENOBUFS;
}
-static inline void cifs_readpage_to_fscache(struct inode *inode,
- struct page *page)
+static inline void cifs_readahead_to_fscache(struct inode *inode,
+ loff_t pos, size_t len)
{
if (cifs_inode_cookie(inode))
- __cifs_readpage_to_fscache(inode, page);
+ __cifs_readahead_to_fscache(inode, pos, len);
}
#else /* CONFIG_CIFS_FSCACHE */
@@ -141,7 +141,7 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page)
}
static inline
-void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {}
+void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {}
#endif /* CONFIG_CIFS_FSCACHE */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 11cdc7cfe0ba..1087ac6104a9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -508,14 +508,15 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_READ;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_READ,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .path = path,
+ .fid = &fid,
+ };
if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
@@ -1518,14 +1519,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
goto out;
}
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = DELETE | FILE_WRITE_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc != 0)
@@ -2112,15 +2114,16 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- /* open the file to be renamed -- we need DELETE perms */
- oparms.desired_access = DELETE;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.path = from_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ /* open the file to be renamed -- we need DELETE perms */
+ .desired_access = DELETE,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .path = from_path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc == 0) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 4510dea77be3..7d97c10f2453 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -271,14 +271,15 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
int buf_type = CIFS_NO_BUFFER;
FILE_ALL_INFO file_info;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_READ;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_READ,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .path = path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, &file_info);
if (rc)
@@ -313,14 +314,15 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_CREATE;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_WRITE,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_CREATE,
+ .path = path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
@@ -355,13 +357,14 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct smb2_file_all_info *pfile_info = NULL;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_READ;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_READ,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .fid = &fid,
+ };
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (utf16_path == NULL)
@@ -421,14 +424,15 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
if (!utf16_path)
return -ENOMEM;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_CREATE;
- oparms.fid = &fid;
- oparms.reconnect = false;
- oparms.mode = 0644;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_WRITE,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_CREATE,
+ .fid = &fid,
+ .mode = 0644,
+ };
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
NULL, NULL);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 95cc4d7dd806..2905734eb289 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -966,16 +966,22 @@ cifs_aio_ctx_release(struct kref *refcount)
/*
* ctx->bv is only set if setup_aio_ctx_iter() was call successfuly
- * which means that iov_iter_get_pages() was a success and thus that
- * we have taken reference on pages.
+ * which means that iov_iter_extract_pages() was a success and thus
+ * that we may have references or pins on pages that we need to
+ * release.
*/
if (ctx->bv) {
- unsigned i;
+ if (ctx->should_dirty || ctx->bv_need_unpin) {
+ unsigned int i;
- for (i = 0; i < ctx->npages; i++) {
- if (ctx->should_dirty)
- set_page_dirty(ctx->bv[i].bv_page);
- put_page(ctx->bv[i].bv_page);
+ for (i = 0; i < ctx->nr_pinned_pages; i++) {
+ struct page *page = ctx->bv[i].bv_page;
+
+ if (ctx->should_dirty)
+ set_page_dirty(page);
+ if (ctx->bv_need_unpin)
+ unpin_user_page(page);
+ }
}
kvfree(ctx->bv);
}
@@ -983,94 +989,6 @@ cifs_aio_ctx_release(struct kref *refcount)
kfree(ctx);
}
-#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024)
-
-int
-setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
-{
- ssize_t rc;
- unsigned int cur_npages;
- unsigned int npages = 0;
- unsigned int i;
- size_t len;
- size_t count = iov_iter_count(iter);
- unsigned int saved_len;
- size_t start;
- unsigned int max_pages = iov_iter_npages(iter, INT_MAX);
- struct page **pages = NULL;
- struct bio_vec *bv = NULL;
-
- if (iov_iter_is_kvec(iter)) {
- memcpy(&ctx->iter, iter, sizeof(*iter));
- ctx->len = count;
- iov_iter_advance(iter, count);
- return 0;
- }
-
- if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT)
- bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL);
-
- if (!bv) {
- bv = vmalloc(array_size(max_pages, sizeof(*bv)));
- if (!bv)
- return -ENOMEM;
- }
-
- if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT)
- pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL);
-
- if (!pages) {
- pages = vmalloc(array_size(max_pages, sizeof(*pages)));
- if (!pages) {
- kvfree(bv);
- return -ENOMEM;
- }
- }
-
- saved_len = count;
-
- while (count && npages < max_pages) {
- rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start);
- if (rc < 0) {
- cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc);
- break;
- }
-
- if (rc > count) {
- cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc,
- count);
- break;
- }
-
- count -= rc;
- rc += start;
- cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
-
- if (npages + cur_npages > max_pages) {
- cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n",
- npages + cur_npages, max_pages);
- break;
- }
-
- for (i = 0; i < cur_npages; i++) {
- len = rc > PAGE_SIZE ? PAGE_SIZE : rc;
- bvec_set_page(&bv[npages + i], pages[i], len - start,
- start);
- rc -= len;
- start = 0;
- }
-
- npages += cur_npages;
- }
-
- kvfree(pages);
- ctx->bv = bv;
- ctx->len = saved_len - count;
- ctx->npages = npages;
- iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len);
- return 0;
-}
-
/**
* cifs_alloc_hash - allocate hash and hash context together
* @name: The name of the crypto hash algo
@@ -1128,25 +1046,6 @@ cifs_free_hash(struct shash_desc **sdesc)
*sdesc = NULL;
}
-/**
- * rqst_page_get_length - obtain the length and offset for a page in smb_rqst
- * @rqst: The request descriptor
- * @page: The index of the page to query
- * @len: Where to store the length for this page:
- * @offset: Where to store the offset for this page
- */
-void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset)
-{
- *len = rqst->rq_pagesz;
- *offset = (page == 0) ? rqst->rq_offset : 0;
-
- if (rqst->rq_npages == 1 || page == rqst->rq_npages-1)
- *len = rqst->rq_tailsz;
- else if (page == 0)
- *len = rqst->rq_pagesz - rqst->rq_offset;
-}
-
void extract_unc_hostname(const char *unc, const char **h, size_t *len)
{
const char *end;
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 55758b9ec877..2c5dde2ece58 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -83,7 +83,7 @@ typedef struct _NEGOTIATE_MESSAGE {
SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
/* SECURITY_BUFFER for version info not present since we
do not set the version is present flag */
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
@@ -135,7 +135,7 @@ typedef struct _AUTHENTICATE_MESSAGE {
__le32 NegotiateFlags;
/* SECURITY_BUFFER for version info not present since we
do not set the version is present flag */
- char UserString[0];
+ char UserString[];
} __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE;
/*
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 2d75ba5aaa8a..ef638086d734 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -495,7 +495,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
FIND_FILE_STANDARD_INFO *pfData;
pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
- new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
+ new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 +
pfData->FileNameLength;
} else {
u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset);
@@ -513,9 +513,9 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
new_entry, end_of_smb, old_entry);
return NULL;
} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
- (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
+ (new_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 > end_of_smb))
|| ((level != SMB_FIND_FILE_INFO_STANDARD) &&
- (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) {
+ (new_entry + sizeof(FILE_DIRECTORY_INFO) + 1 > end_of_smb))) {
cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n",
new_entry, end_of_smb);
return NULL;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c47b254f0d1e..d2cbae4b5d21 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -480,7 +480,6 @@ out:
* remove this channel
*/
cancel_delayed_work_sync(&chan->server->echo);
- cancel_delayed_work_sync(&chan->server->resolve);
cancel_delayed_work_sync(&chan->server->reconnect);
spin_lock(&ses->chan_lock);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 4cb364454e13..abda6148be10 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -576,14 +576,15 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE))
return 0;
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.disposition = FILE_OPEN;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
/* Need to check if this is a symbolic link or not */
tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
@@ -823,14 +824,15 @@ smb_set_file_info(struct inode *inode, const char *full_path,
goto out;
}
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
- oparms.disposition = FILE_OPEN;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
rc = CIFS_open(xid, &oparms, &oplock, NULL);
@@ -998,15 +1000,16 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
goto out;
}
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.create_options = cifs_create_options(cifs_sb,
- OPEN_REPARSE_POINT);
- oparms.disposition = FILE_OPEN;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .create_options = cifs_create_options(cifs_sb,
+ OPEN_REPARSE_POINT),
+ .disposition = FILE_OPEN,
+ .path = full_path,
+ .fid = &fid,
+ };
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
@@ -1115,15 +1118,16 @@ cifs_make_node(unsigned int xid, struct inode *inode,
cifs_dbg(FYI, "sfu compat create special file\n");
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL);
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_WRITE,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL),
+ .disposition = FILE_CREATE,
+ .path = full_path,
+ .fid = &fid,
+ };
if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 9f1dd04b555a..e0ee96d69d49 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -35,7 +35,7 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp,
ErrorContextData) +
sizeof(struct smb2_symlink_err_rsp));
- if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err))
+ if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err) + 1)
return ERR_PTR(-EINVAL);
p = (struct smb2_error_context_rsp *)err->ErrorData;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 8521adf9ce79..37b4cd59245d 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -105,14 +105,15 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
goto finished;
}
- vars->oparms.tcon = tcon;
- vars->oparms.desired_access = desired_access;
- vars->oparms.disposition = create_disposition;
- vars->oparms.create_options = cifs_create_options(cifs_sb, create_options);
- vars->oparms.fid = &fid;
- vars->oparms.reconnect = false;
- vars->oparms.mode = mode;
- vars->oparms.cifs_sb = cifs_sb;
+ vars->oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = desired_access,
+ .disposition = create_disposition,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .fid = &fid,
+ .mode = mode,
+ .cifs_sb = cifs_sb,
+ };
rqst[num_rqst].rq_iov = &vars->open_iov[0];
rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 572293c18e16..3935a60db5c3 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -113,7 +113,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
} else if (nc_offset + 1 == non_ctxlen) {
cifs_dbg(FYI, "no SPNEGO security blob in negprot rsp\n");
size_of_pad_before_neg_ctxts = 0;
- } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE)
+ } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE + 1)
/* has padding, but no SPNEGO blob */
size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen + 1;
else
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cb2deac6b2d7..f79b075f2992 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -729,12 +729,13 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct cached_fid *cfid = NULL;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid);
if (rc == 0)
@@ -771,12 +772,13 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_fid fid;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
@@ -816,12 +818,13 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
if (!utf16_path)
return -ENOMEM;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
&err_iov, &err_buftype);
@@ -1097,13 +1100,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- memset(&oparms, 0, sizeof(oparms));
- oparms.tcon = tcon;
- oparms.desired_access = FILE_WRITE_EA;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_WRITE_EA,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -1453,12 +1456,12 @@ smb2_ioctl_query_info(const unsigned int xid,
rqst[0].rq_iov = &vars->open_iov[0];
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- memset(&oparms, 0, sizeof(oparms));
- oparms.tcon = tcon;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, create_options);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .fid = &fid,
+ };
if (qi.flags & PASSTHRU_FSCTL) {
switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) {
@@ -2088,12 +2091,13 @@ smb3_notify(const unsigned int xid, struct file *pfile,
}
tcon = cifs_sb_master_tcon(cifs_sb);
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
NULL);
@@ -2159,12 +2163,13 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = fid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -2490,12 +2495,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- oparms.tcon = tcon;
- oparms.desired_access = desired_access;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = desired_access,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -2623,12 +2629,13 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
if (!tcon->posix_extensions)
return smb2_queryfs(xid, tcon, cifs_sb, buf);
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL, NULL);
@@ -2916,13 +2923,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- memset(&oparms, 0, sizeof(oparms));
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, create_options);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, create_options),
+ .fid = &fid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -3056,13 +3063,13 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
rqst[0].rq_iov = open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
- memset(&oparms, 0, sizeof(oparms));
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT);
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT),
+ .fid = &fid,
+ };
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, utf16_path);
@@ -3196,17 +3203,20 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
return ERR_PTR(rc);
}
- oparms.tcon = tcon;
- oparms.desired_access = READ_CONTROL;
- oparms.disposition = FILE_OPEN;
- /*
- * When querying an ACL, even if the file is a symlink we want to open
- * the source not the target, and so the protocol requires that the
- * client specify this flag when opening a reparse point
- */
- oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = READ_CONTROL,
+ .disposition = FILE_OPEN,
+ /*
+ * When querying an ACL, even if the file is a symlink
+ * we want to open the source not the target, and so
+ * the protocol requires that the client specify this
+ * flag when opening a reparse point
+ */
+ .create_options = cifs_create_options(cifs_sb, 0) |
+ OPEN_REPARSE_POINT,
+ .fid = &fid,
+ };
if (info & SACL_SECINFO)
oparms.desired_access |= SYSTEM_SECURITY;
@@ -3265,13 +3275,14 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
return rc;
}
- oparms.tcon = tcon;
- oparms.desired_access = access_flags;
- oparms.create_options = cifs_create_options(cifs_sb, 0);
- oparms.disposition = FILE_OPEN;
- oparms.path = path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = access_flags,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .disposition = FILE_OPEN,
+ .path = path,
+ .fid = &fid,
+ };
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
NULL, NULL);
@@ -4227,8 +4238,8 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
- struct aead_request **req, struct scatterlist **sgl,
- unsigned int *num_sgs)
+ struct aead_request **req, struct sg_table *sgt,
+ unsigned int *num_sgs, size_t *sensitive_size)
{
unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
unsigned int iv_size = crypto_aead_ivsize(tfm);
@@ -4236,70 +4247,75 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst
u8 *p;
*num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
+ if (IS_ERR_VALUE((long)(int)*num_sgs))
+ return ERR_PTR(*num_sgs);
len = iv_size;
len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
len = ALIGN(len, crypto_tfm_ctx_alignment());
len += req_size;
len = ALIGN(len, __alignof__(struct scatterlist));
- len += *num_sgs * sizeof(**sgl);
+ len += array_size(*num_sgs, sizeof(struct scatterlist));
+ *sensitive_size = len;
- p = kmalloc(len, GFP_ATOMIC);
+ p = kvzalloc(len, GFP_NOFS);
if (!p)
- return NULL;
+ return ERR_PTR(-ENOMEM);
*iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
*req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
crypto_tfm_ctx_alignment());
- *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
- __alignof__(struct scatterlist));
+ sgt->sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
return p;
}
-static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst,
int num_rqst, const u8 *sig, u8 **iv,
- struct aead_request **req, struct scatterlist **sgl)
+ struct aead_request **req, struct scatterlist **sgl,
+ size_t *sensitive_size)
{
- unsigned int off, len, skip;
- struct scatterlist *sg;
- unsigned int num_sgs;
- unsigned long addr;
- int i, j;
+ struct sg_table sgtable = {};
+ unsigned int skip, num_sgs, i, j;
+ ssize_t rc;
void *p;
- p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs);
- if (!p)
- return NULL;
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable,
+ &num_sgs, sensitive_size);
+ if (IS_ERR(p))
+ return ERR_CAST(p);
- sg_init_table(*sgl, num_sgs);
- sg = *sgl;
+ sg_init_marker(sgtable.sgl, num_sgs);
- /* Assumes the first rqst has a transform header as the first iov.
- * I.e.
- * rqst[0].rq_iov[0] is transform header
- * rqst[0].rq_iov[1+] data to be encrypted/decrypted
- * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
*/
+ skip = 20;
+
for (i = 0; i < num_rqst; i++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob.
- */
+ struct iov_iter *iter = &rqst[i].rq_iter;
+ size_t count = iov_iter_count(iter);
+
for (j = 0; j < rqst[i].rq_nvec; j++) {
- struct kvec *iov = &rqst[i].rq_iov[j];
+ cifs_sg_set_buf(&sgtable,
+ rqst[i].rq_iov[j].iov_base + skip,
+ rqst[i].rq_iov[j].iov_len - skip);
- skip = (i == 0) && (j == 0) ? 20 : 0;
- addr = (unsigned long)iov->iov_base + skip;
- len = iov->iov_len - skip;
- sg = cifs_sg_set_buf(sg, (void *)addr, len);
- }
- for (j = 0; j < rqst[i].rq_npages; j++) {
- rqst_page_get_length(&rqst[i], j, &len, &off);
- sg_set_page(sg++, rqst[i].rq_pages[j], len, off);
+ /* See the above comment on the 'skip' assignment */
+ skip = 0;
}
+ sgtable.orig_nents = sgtable.nents;
+
+ rc = netfs_extract_iter_to_sg(iter, count, &sgtable,
+ num_sgs - sgtable.nents, 0);
+ iov_iter_revert(iter, rc);
+ sgtable.orig_nents = sgtable.nents;
}
- cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE);
+ cifs_sg_set_buf(&sgtable, sig, SMB2_SIGNATURE_SIZE);
+ sg_mark_end(&sgtable.sgl[sgtable.nents - 1]);
+ *sgl = sgtable.sgl;
return p;
}
@@ -4353,6 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
+ size_t sensitive_size;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4386,9 +4403,10 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
- if (unlikely(!creq))
- return -ENOMEM;
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg,
+ &sensitive_size);
+ if (IS_ERR(creq))
+ return PTR_ERR(creq);
if (!enc) {
memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
@@ -4416,22 +4434,35 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree_sensitive(creq);
+ kvfree_sensitive(creq, sensitive_size);
return rc;
}
+/*
+ * Clear a read buffer, discarding the folios which have XA_MARK_0 set.
+ */
+static void cifs_clear_xarray_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
void
smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst)
{
- int i, j;
+ int i;
- for (i = 0; i < num_rqst; i++) {
- if (rqst[i].rq_pages) {
- for (j = rqst[i].rq_npages - 1; j >= 0; j--)
- put_page(rqst[i].rq_pages[j]);
- kfree(rqst[i].rq_pages);
- }
- }
+ for (i = 0; i < num_rqst; i++)
+ if (!xa_empty(&rqst[i].rq_buffer))
+ cifs_clear_xarray_buffer(&rqst[i].rq_buffer);
}
/*
@@ -4451,9 +4482,8 @@ static int
smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *new_rq, struct smb_rqst *old_rq)
{
- struct page **pages;
struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base;
- unsigned int npages;
+ struct page *page;
unsigned int orig_len = 0;
int i, j;
int rc = -ENOMEM;
@@ -4461,40 +4491,45 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
for (i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
+ struct xarray *buffer = &new->rq_buffer;
+ size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0;
orig_len += smb_rqst_len(server, old);
new->rq_iov = old->rq_iov;
new->rq_nvec = old->rq_nvec;
- npages = old->rq_npages;
- if (!npages)
- continue;
-
- pages = kmalloc_array(npages, sizeof(struct page *),
- GFP_KERNEL);
- if (!pages)
- goto err_free;
-
- new->rq_pages = pages;
- new->rq_npages = npages;
- new->rq_offset = old->rq_offset;
- new->rq_pagesz = old->rq_pagesz;
- new->rq_tailsz = old->rq_tailsz;
-
- for (j = 0; j < npages; j++) {
- pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!pages[j])
- goto err_free;
- }
+ xa_init(buffer);
+
+ if (size > 0) {
+ unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE);
+
+ for (j = 0; j < npages; j++) {
+ void *o;
- /* copy pages form the old */
- for (j = 0; j < npages; j++) {
- unsigned int offset, len;
+ rc = -ENOMEM;
+ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ if (!page)
+ goto err_free;
+ page->index = j;
+ o = xa_store(buffer, j, page, GFP_KERNEL);
+ if (xa_is_err(o)) {
+ rc = xa_err(o);
+ put_page(page);
+ goto err_free;
+ }
- rqst_page_get_length(new, j, &len, &offset);
+ xa_set_mark(buffer, j, XA_MARK_0);
- memcpy_page(new->rq_pages[j], offset,
- old->rq_pages[j], offset, len);
+ seg = min_t(size_t, size - copied, PAGE_SIZE);
+ if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) {
+ rc = -EFAULT;
+ goto err_free;
+ }
+ copied += seg;
+ }
+ iov_iter_xarray(&new->rq_iter, ITER_SOURCE,
+ buffer, 0, size);
+ new->rq_iter_size = size;
}
}
@@ -4523,12 +4558,12 @@ smb3_is_transform_hdr(void *buf)
static int
decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
- unsigned int buf_data_size, struct page **pages,
- unsigned int npages, unsigned int page_data_size,
+ unsigned int buf_data_size, struct iov_iter *iter,
bool is_offloaded)
{
struct kvec iov[2];
struct smb_rqst rqst = {NULL};
+ size_t iter_size = 0;
int rc;
iov[0].iov_base = buf;
@@ -4538,10 +4573,11 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
- rqst.rq_pages = pages;
- rqst.rq_npages = npages;
- rqst.rq_pagesz = PAGE_SIZE;
- rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE;
+ if (iter) {
+ rqst.rq_iter = *iter;
+ rqst.rq_iter_size = iov_iter_count(iter);
+ iter_size = iov_iter_count(iter);
+ }
rc = crypt_message(server, 1, &rqst, 0);
cifs_dbg(FYI, "Decrypt message returned %d\n", rc);
@@ -4552,73 +4588,37 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
memmove(buf, iov[1].iov_base, buf_data_size);
if (!is_offloaded)
- server->total_read = buf_data_size + page_data_size;
+ server->total_read = buf_data_size + iter_size;
return rc;
}
static int
-read_data_into_pages(struct TCP_Server_Info *server, struct page **pages,
- unsigned int npages, unsigned int len)
+cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size,
+ unsigned int skip, struct iov_iter *iter)
{
- int i;
- int length;
+ struct page *page;
+ unsigned long index;
- for (i = 0; i < npages; i++) {
- struct page *page = pages[i];
- size_t n;
+ xa_for_each(pages, index, page) {
+ size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size);
- n = len;
- if (len >= PAGE_SIZE) {
- /* enough data to fill the page */
- n = PAGE_SIZE;
- len -= n;
- } else {
- zero_user(page, len, PAGE_SIZE - len);
- len = 0;
+ n = copy_page_to_iter(page, skip, len, iter);
+ if (n != len) {
+ cifs_dbg(VFS, "%s: something went wrong\n", __func__);
+ return -EIO;
}
- length = cifs_read_page_from_socket(server, page, 0, n);
- if (length < 0)
- return length;
- server->total_read += length;
+ data_size -= n;
+ skip = 0;
}
return 0;
}
static int
-init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size,
- unsigned int cur_off, struct bio_vec **page_vec)
-{
- struct bio_vec *bvec;
- int i;
-
- bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL);
- if (!bvec)
- return -ENOMEM;
-
- for (i = 0; i < npages; i++) {
- bvec_set_page(&bvec[i], pages[i],
- min_t(unsigned int, PAGE_SIZE, data_size),
- i == 0 ? cur_off : 0);
- data_size -= bvec[i].bv_len;
- }
-
- if (data_size != 0) {
- cifs_dbg(VFS, "%s: something went wrong\n", __func__);
- kfree(bvec);
- return -EIO;
- }
-
- *page_vec = bvec;
- return 0;
-}
-
-static int
handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
- char *buf, unsigned int buf_len, struct page **pages,
- unsigned int npages, unsigned int page_data_size,
- bool is_offloaded)
+ char *buf, unsigned int buf_len, struct xarray *pages,
+ unsigned int pages_len, bool is_offloaded)
{
unsigned int data_offset;
unsigned int data_len;
@@ -4627,9 +4627,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
unsigned int pad_len;
struct cifs_readdata *rdata = mid->callback_data;
struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
- struct bio_vec *bvec = NULL;
- struct iov_iter iter;
- struct kvec iov;
int length;
bool use_rdma_mr = false;
@@ -4718,7 +4715,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- if (data_len > page_data_size - pad_len) {
+ if (data_len > pages_len - pad_len) {
/* data_len is corrupt -- discard frame */
rdata->result = -EIO;
if (is_offloaded)
@@ -4728,8 +4725,9 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- rdata->result = init_read_bvec(pages, npages, page_data_size,
- cur_off, &bvec);
+ /* Copy the data to the output I/O iterator. */
+ rdata->result = cifs_copy_pages_to_iter(pages, pages_len,
+ cur_off, &rdata->iter);
if (rdata->result != 0) {
if (is_offloaded)
mid->mid_state = MID_RESPONSE_MALFORMED;
@@ -4737,14 +4735,16 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
dequeue_mid(mid, rdata->result);
return 0;
}
+ rdata->got_bytes = pages_len;
- iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len);
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
- WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
- iov.iov_base = buf + data_offset;
- iov.iov_len = data_len;
- iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len);
+ WARN_ONCE(pages && !xa_empty(pages),
+ "read data can be either in buf or in pages");
+ length = copy_to_iter(buf + data_offset, data_len, &rdata->iter);
+ if (length < 0)
+ return length;
+ rdata->got_bytes = data_len;
} else {
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
@@ -4756,26 +4756,18 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- length = rdata->copy_into_pages(server, rdata, &iter);
-
- kfree(bvec);
-
- if (length < 0)
- return length;
-
if (is_offloaded)
mid->mid_state = MID_RESPONSE_RECEIVED;
else
dequeue_mid(mid, false);
- return length;
+ return 0;
}
struct smb2_decrypt_work {
struct work_struct decrypt;
struct TCP_Server_Info *server;
- struct page **ppages;
+ struct xarray buffer;
char *buf;
- unsigned int npages;
unsigned int len;
};
@@ -4784,11 +4776,13 @@ static void smb2_decrypt_offload(struct work_struct *work)
{
struct smb2_decrypt_work *dw = container_of(work,
struct smb2_decrypt_work, decrypt);
- int i, rc;
+ int rc;
struct mid_q_entry *mid;
+ struct iov_iter iter;
+ iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len);
rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size,
- dw->ppages, dw->npages, dw->len, true);
+ &iter, true);
if (rc) {
cifs_dbg(VFS, "error decrypting rc=%d\n", rc);
goto free_pages;
@@ -4802,7 +4796,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
mid->decrypted = true;
rc = handle_read_data(dw->server, mid, dw->buf,
dw->server->vals->read_rsp_size,
- dw->ppages, dw->npages, dw->len,
+ &dw->buffer, dw->len,
true);
if (rc >= 0) {
#ifdef CONFIG_CIFS_STATS2
@@ -4835,10 +4829,7 @@ static void smb2_decrypt_offload(struct work_struct *work)
}
free_pages:
- for (i = dw->npages-1; i >= 0; i--)
- put_page(dw->ppages[i]);
-
- kfree(dw->ppages);
+ cifs_clear_xarray_buffer(&dw->buffer);
cifs_small_buf_release(dw->buf);
kfree(dw);
}
@@ -4848,47 +4839,66 @@ static int
receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
int *num_mids)
{
+ struct page *page;
char *buf = server->smallbuf;
struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
- unsigned int npages;
- struct page **pages;
- unsigned int len;
+ struct iov_iter iter;
+ unsigned int len, npages;
unsigned int buflen = server->pdu_size;
int rc;
int i = 0;
struct smb2_decrypt_work *dw;
+ dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL);
+ if (!dw)
+ return -ENOMEM;
+ xa_init(&dw->buffer);
+ INIT_WORK(&dw->decrypt, smb2_decrypt_offload);
+ dw->server = server;
+
*num_mids = 1;
len = min_t(unsigned int, buflen, server->vals->read_rsp_size +
sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
if (rc < 0)
- return rc;
+ goto free_dw;
server->total_read += rc;
len = le32_to_cpu(tr_hdr->OriginalMessageSize) -
server->vals->read_rsp_size;
+ dw->len = len;
npages = DIV_ROUND_UP(len, PAGE_SIZE);
- pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- rc = -ENOMEM;
- goto discard_data;
- }
-
+ rc = -ENOMEM;
for (; i < npages; i++) {
- pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
- if (!pages[i]) {
- rc = -ENOMEM;
+ void *old;
+
+ page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ if (!page)
+ goto discard_data;
+ page->index = i;
+ old = xa_store(&dw->buffer, i, page, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ rc = xa_err(old);
+ put_page(page);
goto discard_data;
}
+ xa_set_mark(&dw->buffer, i, XA_MARK_0);
}
- /* read read data into pages */
- rc = read_data_into_pages(server, pages, npages, len);
- if (rc)
- goto free_pages;
+ iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE);
+
+ /* Read the data into the buffer and clear excess bufferage. */
+ rc = cifs_read_iter_from_socket(server, &iter, dw->len);
+ if (rc < 0)
+ goto discard_data;
+
+ server->total_read += rc;
+ if (rc < npages * PAGE_SIZE)
+ iov_iter_zero(npages * PAGE_SIZE - rc, &iter);
+ iov_iter_revert(&iter, npages * PAGE_SIZE);
+ iov_iter_truncate(&iter, dw->len);
rc = cifs_discard_remaining_data(server);
if (rc)
@@ -4901,39 +4911,28 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid,
if ((server->min_offload) && (server->in_flight > 1) &&
(server->pdu_size >= server->min_offload)) {
- dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL);
- if (dw == NULL)
- goto non_offloaded_decrypt;
-
dw->buf = server->smallbuf;
server->smallbuf = (char *)cifs_small_buf_get();
- INIT_WORK(&dw->decrypt, smb2_decrypt_offload);
-
- dw->npages = npages;
- dw->server = server;
- dw->ppages = pages;
- dw->len = len;
queue_work(decrypt_wq, &dw->decrypt);
*num_mids = 0; /* worker thread takes care of finding mid */
return -1;
}
-non_offloaded_decrypt:
rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size,
- pages, npages, len, false);
+ &iter, false);
if (rc)
goto free_pages;
*mid = smb2_find_mid(server, buf);
- if (*mid == NULL)
+ if (*mid == NULL) {
cifs_dbg(FYI, "mid not found\n");
- else {
+ } else {
cifs_dbg(FYI, "mid found\n");
(*mid)->decrypted = true;
rc = handle_read_data(server, *mid, buf,
server->vals->read_rsp_size,
- pages, npages, len, false);
+ &dw->buffer, dw->len, false);
if (rc >= 0) {
if (server->ops->is_network_name_deleted) {
server->ops->is_network_name_deleted(buf,
@@ -4943,9 +4942,9 @@ non_offloaded_decrypt:
}
free_pages:
- for (i = i - 1; i >= 0; i--)
- put_page(pages[i]);
- kfree(pages);
+ cifs_clear_xarray_buffer(&dw->buffer);
+free_dw:
+ kfree(dw);
return rc;
discard_data:
cifs_discard_remaining_data(server);
@@ -4983,7 +4982,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
server->total_read += length;
buf_size = pdu_length - sizeof(struct smb2_transform_hdr);
- length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false);
+ length = decrypt_raw_data(server, buf, buf_size, NULL, false);
if (length)
return length;
@@ -5082,7 +5081,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
return handle_read_data(server, mid, buf, server->pdu_size,
- NULL, 0, 0, false);
+ NULL, 0, false);
}
static int
@@ -5134,15 +5133,16 @@ smb2_make_node(unsigned int xid, struct inode *inode,
cifs_dbg(FYI, "sfu compat create special file\n");
- oparms.tcon = tcon;
- oparms.cifs_sb = cifs_sb;
- oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
- CREATE_OPTION_SPECIAL);
- oparms.disposition = FILE_CREATE;
- oparms.path = full_path;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .cifs_sb = cifs_sb,
+ .desired_access = GENERIC_WRITE,
+ .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL),
+ .disposition = FILE_CREATE,
+ .path = full_path,
+ .fid = &fid,
+ };
if (tcon->ses->server->oplocks)
oplock = REQ_OPLOCK;
@@ -5629,7 +5629,7 @@ struct smb_version_values smb20_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5651,7 +5651,7 @@ struct smb_version_values smb21_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5672,7 +5672,7 @@ struct smb_version_values smb3any_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5693,7 +5693,7 @@ struct smb_version_values smbdefault_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5714,7 +5714,7 @@ struct smb_version_values smb30_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5735,7 +5735,7 @@ struct smb_version_values smb302_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -5756,7 +5756,7 @@ struct smb_version_values smb311_values = {
.header_size = sizeof(struct smb2_hdr),
.header_preamble_size = 0,
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2c9ffa921e6f..ca9d7110ddcb 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -139,6 +139,66 @@ out:
return;
}
+static int wait_for_server_reconnect(struct TCP_Server_Info *server,
+ __le16 smb2_command, bool retry)
+{
+ int timeout = 10;
+ int rc;
+
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&server->srv_lock);
+ return 0;
+ }
+ timeout *= server->nr_targets;
+ spin_unlock(&server->srv_lock);
+
+ /*
+ * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
+ * here since they are implicitly done when session drops.
+ */
+ switch (smb2_command) {
+ /*
+ * BB Should we keep oplock break and add flush to exceptions?
+ */
+ case SMB2_TREE_DISCONNECT:
+ case SMB2_CANCEL:
+ case SMB2_CLOSE:
+ case SMB2_OPLOCK_BREAK:
+ return -EAGAIN;
+ }
+
+ /*
+ * Give demultiplex thread up to 10 seconds to each target available for
+ * reconnect -- should be greater than cifs socket timeout which is 7
+ * seconds.
+ *
+ * On "soft" mounts we wait once. Hard mounts keep retrying until
+ * process is killed or server comes back on-line.
+ */
+ do {
+ rc = wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus != CifsNeedReconnect),
+ timeout * HZ);
+ if (rc < 0) {
+ cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
+ __func__);
+ return -ERESTARTSYS;
+ }
+
+ /* are we still trying to reconnect? */
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus != CifsNeedReconnect) {
+ spin_unlock(&server->srv_lock);
+ return 0;
+ }
+ spin_unlock(&server->srv_lock);
+ } while (retry);
+
+ cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
+ return -EHOSTDOWN;
+}
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
@@ -146,7 +206,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
int rc = 0;
struct nls_table *nls_codepage;
struct cifs_ses *ses;
- int retries;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -184,61 +243,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
(!tcon->ses->server) || !server)
return -EIO;
- ses = tcon->ses;
- retries = server->nr_targets;
-
- /*
- * Give demultiplex thread up to 10 seconds to each target available for
- * reconnect -- should be greater than cifs socket timeout which is 7
- * seconds.
- */
- while (server->tcpStatus == CifsNeedReconnect) {
- /*
- * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
- * here since they are implicitly done when session drops.
- */
- switch (smb2_command) {
- /*
- * BB Should we keep oplock break and add flush to exceptions?
- */
- case SMB2_TREE_DISCONNECT:
- case SMB2_CANCEL:
- case SMB2_CLOSE:
- case SMB2_OPLOCK_BREAK:
- return -EAGAIN;
- }
-
- rc = wait_event_interruptible_timeout(server->response_q,
- (server->tcpStatus != CifsNeedReconnect),
- 10 * HZ);
- if (rc < 0) {
- cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n",
- __func__);
- return -ERESTARTSYS;
- }
-
- /* are we still trying to reconnect? */
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsNeedReconnect) {
- spin_unlock(&server->srv_lock);
- break;
- }
- spin_unlock(&server->srv_lock);
-
- if (retries && --retries)
- continue;
+ rc = wait_for_server_reconnect(server, smb2_command, tcon->retry);
+ if (rc)
+ return rc;
- /*
- * on "soft" mounts we wait once. Hard mounts keep
- * retrying until process is killed or server comes
- * back on-line
- */
- if (!tcon->retry) {
- cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
- return -EHOSTDOWN;
- }
- retries = server->nr_targets;
- }
+ ses = tcon->ses;
spin_lock(&ses->chan_lock);
if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) {
@@ -1364,7 +1373,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
/* Testing shows that buffer offset must be at location of Buffer[0] */
req->SecurityBufferOffset =
- cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */);
+ cpu_to_le16(sizeof(struct smb2_sess_setup_req));
req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len);
memset(&rqst, 0, sizeof(struct smb_rqst));
@@ -1858,12 +1867,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
if (unc_path == NULL)
return -ENOMEM;
- unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1;
- unc_path_len *= 2;
- if (unc_path_len < 2) {
+ unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp);
+ if (unc_path_len <= 0) {
kfree(unc_path);
return -EINVAL;
}
+ unc_path_len *= 2;
/* SMB2 TREE_CONNECT request must be called with TreeId == 0 */
tcon->tid = 0;
@@ -1883,9 +1892,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
iov[0].iov_len = total_len - 1;
/* Testing shows that buffer offset must be at location of Buffer[0] */
- req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)
- - 1 /* pad */);
- req->PathLength = cpu_to_le16(unc_path_len - 2);
+ req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req));
+ req->PathLength = cpu_to_le16(unc_path_len);
iov[1].iov_base = unc_path;
iov[1].iov_len = unc_path_len;
@@ -3764,7 +3772,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
ses->Suid, (u8)watch_tree, completion_filter);
/* validate that notify information is plausible */
if ((rsp_iov.iov_base == NULL) ||
- (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp)))
+ (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp) + 1))
goto cnotify_exit;
smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base;
@@ -3898,7 +3906,7 @@ void smb2_reconnect_server(struct work_struct *work)
goto done;
/* allocate a dummy tcon struct used for reconnect */
- tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL);
+ tcon = tconInfoAlloc();
if (!tcon) {
resched = true;
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
@@ -3921,7 +3929,7 @@ void smb2_reconnect_server(struct work_struct *work)
list_del_init(&ses->rlist);
cifs_put_smb_ses(ses);
}
- kfree(tcon);
+ tconInfoFree(tcon);
done:
cifs_dbg(FYI, "Reconnecting tcons and channels finished\n");
@@ -4054,6 +4062,36 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
return rc;
}
+#ifdef CONFIG_CIFS_SMB_DIRECT
+static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms)
+{
+ struct TCP_Server_Info *server = io_parms->server;
+ struct cifs_tcon *tcon = io_parms->tcon;
+
+ /* we can only offload if we're connected */
+ if (!server || !tcon)
+ return false;
+
+ /* we can only offload on an rdma connection */
+ if (!server->rdma || !server->smbd_conn)
+ return false;
+
+ /* we don't support signed offload yet */
+ if (server->sign)
+ return false;
+
+ /* we don't support encrypted offload yet */
+ if (smb3_encryption_required(tcon))
+ return false;
+
+ /* offload also has its overhead, so only do it if desired */
+ if (io_parms->length < server->smbd_conn->rdma_readwrite_threshold)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_CIFS_SMB_DIRECT */
+
/*
* To form a chain of read requests, any read requests after the first should
* have the end_of_chain boolean set to true.
@@ -4097,16 +4135,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* If we want to do a RDMA write, fill in and append
* smbd_buffer_descriptor_v1 to the end of read request
*/
- if (server->rdma && rdata && !server->sign &&
- rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) {
-
+ if (smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- rdata->mr = smbd_register_mr(
- server->smbd_conn, rdata->pages,
- rdata->nr_pages, rdata->page_offset,
- rdata->tailsz, true, need_invalidate);
+ rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter,
+ true, need_invalidate);
if (!rdata->mr)
return -EAGAIN;
@@ -4163,15 +4197,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
(struct smb2_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
- .rq_nvec = 1, };
-
- if (rdata->got_bytes) {
- rqst.rq_pages = rdata->pages;
- rqst.rq_offset = rdata->page_offset;
- rqst.rq_npages = rdata->nr_pages;
- rqst.rq_pagesz = rdata->pagesz;
- rqst.rq_tailsz = rdata->tailsz;
- }
+ .rq_nvec = 1,
+ .rq_iter = rdata->iter,
+ .rq_iter_size = iov_iter_count(&rdata->iter), };
WARN_ONCE(rdata->server != mid->server,
"rdata server %p != mid server %p",
@@ -4189,6 +4217,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (server->sign && !mid->decrypted) {
int rc;
+ iov_iter_revert(&rqst.rq_iter, rdata->got_bytes);
+ iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes);
rc = smb2_verify_signature(&rqst, server);
if (rc)
cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n",
@@ -4495,10 +4525,27 @@ smb2_async_writev(struct cifs_writedata *wdata,
struct kvec iov[1];
struct smb_rqst rqst = { };
unsigned int total_len;
+ struct cifs_io_parms _io_parms;
+ struct cifs_io_parms *io_parms = NULL;
if (!wdata->server)
server = wdata->server = cifs_pick_channel(tcon->ses);
+ /*
+ * in future we may get cifs_io_parms passed in from the caller,
+ * but for now we construct it here...
+ */
+ _io_parms = (struct cifs_io_parms) {
+ .tcon = tcon,
+ .server = server,
+ .offset = wdata->offset,
+ .length = wdata->bytes,
+ .persistent_fid = wdata->cfile->fid.persistent_fid,
+ .volatile_fid = wdata->cfile->fid.volatile_fid,
+ .pid = wdata->pid,
+ };
+ io_parms = &_io_parms;
+
rc = smb2_plain_req_init(SMB2_WRITE, tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -4508,49 +4555,44 @@ smb2_async_writev(struct cifs_writedata *wdata,
flags |= CIFS_TRANSFORM_REQ;
shdr = (struct smb2_hdr *)req;
- shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
+ shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
- req->PersistentFileId = wdata->cfile->fid.persistent_fid;
- req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+ req->PersistentFileId = io_parms->persistent_fid;
+ req->VolatileFileId = io_parms->volatile_fid;
req->WriteChannelInfoOffset = 0;
req->WriteChannelInfoLength = 0;
- req->Channel = 0;
- req->Offset = cpu_to_le64(wdata->offset);
+ req->Channel = SMB2_CHANNEL_NONE;
+ req->Offset = cpu_to_le64(io_parms->offset);
req->DataOffset = cpu_to_le16(
offsetof(struct smb2_write_req, Buffer));
req->RemainingBytes = 0;
- trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes);
+ trace_smb3_write_enter(0 /* xid */,
+ io_parms->persistent_fid,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset,
+ io_parms->length);
+
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
* smbd_buffer_descriptor_v1 to the end of write request
*/
- if (server->rdma && !server->sign && wdata->bytes >=
- server->smbd_conn->rdma_readwrite_threshold) {
-
+ if (smb3_use_rdma_offload(io_parms)) {
struct smbd_buffer_descriptor_v1 *v1;
+ size_t data_size = iov_iter_count(&wdata->iter);
bool need_invalidate = server->dialect == SMB30_PROT_ID;
- wdata->mr = smbd_register_mr(
- server->smbd_conn, wdata->pages,
- wdata->nr_pages, wdata->page_offset,
- wdata->tailsz, false, need_invalidate);
+ wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter,
+ false, need_invalidate);
if (!wdata->mr) {
rc = -EAGAIN;
goto async_writev_out;
}
req->Length = 0;
req->DataOffset = 0;
- if (wdata->nr_pages > 1)
- req->RemainingBytes =
- cpu_to_le32(
- (wdata->nr_pages - 1) * wdata->pagesz -
- wdata->page_offset + wdata->tailsz
- );
- else
- req->RemainingBytes = cpu_to_le32(wdata->tailsz);
+ req->RemainingBytes = cpu_to_le32(data_size);
req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE;
if (need_invalidate)
req->Channel = SMB2_CHANNEL_RDMA_V1;
@@ -4569,26 +4611,21 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rqst.rq_pages = wdata->pages;
- rqst.rq_offset = wdata->page_offset;
- rqst.rq_npages = wdata->nr_pages;
- rqst.rq_pagesz = wdata->pagesz;
- rqst.rq_tailsz = wdata->tailsz;
+ rqst.rq_iter = wdata->iter;
+ rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
- if (wdata->mr) {
+ if (wdata->mr)
iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
- rqst.rq_npages = 0;
- }
#endif
- cifs_dbg(FYI, "async write at %llu %u bytes\n",
- wdata->offset, wdata->bytes);
+ cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n",
+ io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter));
#ifdef CONFIG_CIFS_SMB_DIRECT
/* For RDMA read, I/O size is in RemainingBytes not in Length */
if (!wdata->mr)
- req->Length = cpu_to_le32(wdata->bytes);
+ req->Length = cpu_to_le32(io_parms->length);
#else
- req->Length = cpu_to_le32(wdata->bytes);
+ req->Length = cpu_to_le32(io_parms->length);
#endif
if (wdata->credits.value > 0) {
@@ -4596,7 +4633,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
SMB2_MAX_BUFFER_SIZE));
shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8);
- rc = adjust_credits(server, &wdata->credits, wdata->bytes);
+ rc = adjust_credits(server, &wdata->credits, io_parms->length);
if (rc)
goto async_writev_out;
@@ -4609,9 +4646,12 @@ smb2_async_writev(struct cifs_writedata *wdata,
if (rc) {
trace_smb3_write_err(0 /* no xid */,
- req->PersistentFileId,
- tcon->tid, tcon->ses->Suid, wdata->offset,
- wdata->bytes, rc);
+ io_parms->persistent_fid,
+ io_parms->tcon->tid,
+ io_parms->tcon->ses->Suid,
+ io_parms->offset,
+ io_parms->length,
+ rc);
kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
}
@@ -4906,7 +4946,7 @@ int SMB2_query_directory_init(const unsigned int xid,
memcpy(bufptr, &asteriks, len);
req->FileNameOffset =
- cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1);
+ cpu_to_le16(sizeof(struct smb2_query_directory_req));
req->FileNameLength = cpu_to_le16(len);
/*
* BB could be 30 bytes or so longer if we used SMB2 specific
@@ -4951,10 +4991,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
switch (srch_inf->info_level) {
case SMB_FIND_FILE_DIRECTORY_INFO:
- info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
+ info_buf_size = sizeof(FILE_DIRECTORY_INFO);
break;
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
- info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
+ info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO);
break;
case SMB_FIND_FILE_POSIX_INFO:
/* note that posix payload are variable size */
@@ -5102,8 +5142,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->VolatileFileId = volatile_fid;
req->AdditionalInformation = cpu_to_le32(additional_info);
- req->BufferOffset =
- cpu_to_le16(sizeof(struct smb2_set_info_req) - 1);
+ req->BufferOffset = cpu_to_le16(sizeof(struct smb2_set_info_req));
req->BufferLength = cpu_to_le32(*size);
memcpy(req->Buffer, *data, *size);
@@ -5337,9 +5376,9 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon,
req->VolatileFileId = volatile_fid;
/* 1 for pad */
req->InputBufferOffset =
- cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
+ cpu_to_le16(sizeof(struct smb2_query_info_req));
req->OutputBufferLength = cpu_to_le32(
- outbuf_len + sizeof(struct smb2_query_info_rsp) - 1);
+ outbuf_len + sizeof(struct smb2_query_info_rsp));
iov->iov_base = (char *)req;
iov->iov_len = total_len;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 1237bb86e93a..2114e8a0c63a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -57,7 +57,7 @@ struct smb2_rdma_crypto_transform {
#define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
#define SMB2_SYMLINK_STRUCT_SIZE \
- (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+ (sizeof(struct smb2_err_rsp) + sizeof(struct smb2_symlink_err_rsp))
#define SYMLINK_ERROR_TAG 0x4c4d5953
@@ -371,7 +371,7 @@ struct smb2_file_id_extd_directory_info {
__le32 EaSize; /* EA size */
__le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */
__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */
- char FileName[1];
+ char FileName[];
} __packed; /* level 60 */
extern char smb2_padding[7];
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 8c816b25ce7c..55b6e319a61d 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -34,16 +34,21 @@ static int smbd_post_recv(
struct smbd_response *response);
static int smbd_post_send_empty(struct smbd_connection *info);
-static int smbd_post_send_data(
- struct smbd_connection *info,
- struct kvec *iov, int n_vec, int remaining_data_length);
-static int smbd_post_send_page(struct smbd_connection *info,
- struct page *page, unsigned long offset,
- size_t size, int remaining_data_length);
static void destroy_mr_list(struct smbd_connection *info);
static int allocate_mr_list(struct smbd_connection *info);
+struct smb_extract_to_rdma {
+ struct ib_sge *sge;
+ unsigned int nr_sge;
+ unsigned int max_sge;
+ struct ib_device *device;
+ u32 local_dma_lkey;
+ enum dma_data_direction direction;
+};
+static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
+ struct smb_extract_to_rdma *rdma);
+
/* SMBD version number */
#define SMBD_V1 0x0100
@@ -823,16 +828,16 @@ static int smbd_post_send(struct smbd_connection *info,
return rc;
}
-static int smbd_post_send_sgl(struct smbd_connection *info,
- struct scatterlist *sgl, int data_length, int remaining_data_length)
+static int smbd_post_send_iter(struct smbd_connection *info,
+ struct iov_iter *iter,
+ int *_remaining_data_length)
{
- int num_sgs;
int i, rc;
int header_length;
+ int data_length;
struct smbd_request *request;
struct smbd_data_transfer *packet;
int new_credits;
- struct scatterlist *sg;
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
@@ -876,6 +881,30 @@ wait_send_queue:
}
request->info = info;
+ memset(request->sge, 0, sizeof(request->sge));
+
+ /* Fill in the data payload to find out how much data we can add */
+ if (iter) {
+ struct smb_extract_to_rdma extract = {
+ .nr_sge = 1,
+ .max_sge = SMBDIRECT_MAX_SEND_SGE,
+ .sge = request->sge,
+ .device = info->id->device,
+ .local_dma_lkey = info->pd->local_dma_lkey,
+ .direction = DMA_TO_DEVICE,
+ };
+
+ rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length,
+ &extract);
+ if (rc < 0)
+ goto err_dma;
+ data_length = rc;
+ request->num_sge = extract.nr_sge;
+ *_remaining_data_length -= data_length;
+ } else {
+ data_length = 0;
+ request->num_sge = 1;
+ }
/* Fill in the packet header */
packet = smbd_request_payload(request);
@@ -897,7 +926,7 @@ wait_send_queue:
else
packet->data_offset = cpu_to_le32(24);
packet->data_length = cpu_to_le32(data_length);
- packet->remaining_data_length = cpu_to_le32(remaining_data_length);
+ packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
packet->padding = 0;
log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
@@ -913,7 +942,6 @@ wait_send_queue:
if (!data_length)
header_length = offsetof(struct smbd_data_transfer, padding);
- request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(info->id->device,
(void *)packet,
header_length,
@@ -927,23 +955,6 @@ wait_send_queue:
request->sge[0].length = header_length;
request->sge[0].lkey = info->pd->local_dma_lkey;
- /* Fill in the packet data payload */
- num_sgs = sgl ? sg_nents(sgl) : 0;
- for_each_sg(sgl, sg, num_sgs, i) {
- request->sge[i+1].addr =
- ib_dma_map_page(info->id->device, sg_page(sg),
- sg->offset, sg->length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(
- info->id->device, request->sge[i+1].addr)) {
- rc = -EIO;
- request->sge[i+1].addr = 0;
- goto err_dma;
- }
- request->sge[i+1].length = sg->length;
- request->sge[i+1].lkey = info->pd->local_dma_lkey;
- request->num_sge++;
- }
-
rc = smbd_post_send(info, request);
if (!rc)
return 0;
@@ -976,61 +987,16 @@ err_wait_credit:
}
/*
- * Send a page
- * page: the page to send
- * offset: offset in the page to send
- * size: length in the page to send
- * remaining_data_length: remaining data to send in this payload
- */
-static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
- unsigned long offset, size_t size, int remaining_data_length)
-{
- struct scatterlist sgl;
-
- sg_init_table(&sgl, 1);
- sg_set_page(&sgl, page, size, offset);
-
- return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
-}
-
-/*
* Send an empty message
* Empty message is used to extend credits to peer to for keep live
* while there is no upper layer payload to send at the time
*/
static int smbd_post_send_empty(struct smbd_connection *info)
{
- info->count_send_empty++;
- return smbd_post_send_sgl(info, NULL, 0, 0);
-}
-
-/*
- * Send a data buffer
- * iov: the iov array describing the data buffers
- * n_vec: number of iov array
- * remaining_data_length: remaining data to send following this packet
- * in segmented SMBD packet
- */
-static int smbd_post_send_data(
- struct smbd_connection *info, struct kvec *iov, int n_vec,
- int remaining_data_length)
-{
- int i;
- u32 data_length = 0;
- struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1];
-
- if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) {
- cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
- return -EINVAL;
- }
+ int remaining_data_length = 0;
- sg_init_table(sgl, n_vec);
- for (i = 0; i < n_vec; i++) {
- data_length += iov[i].iov_len;
- sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
- }
-
- return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
+ info->count_send_empty++;
+ return smbd_post_send_iter(info, NULL, &remaining_data_length);
}
/*
@@ -1700,6 +1666,7 @@ static struct smbd_connection *_smbd_get_connection(
allocate_mr_failed:
/* At this point, need to a full transport shutdown */
+ server->smbd_conn = info;
smbd_destroy(server);
return NULL;
@@ -1985,18 +1952,10 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
- struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1];
- int nvecs;
- int size;
- unsigned int buflen, remaining_data_length;
- unsigned int offset, remaining_vec_data_length;
- int start, i, j;
- int max_iov_size =
- info->max_send_size - sizeof(struct smbd_data_transfer);
- struct kvec *iov;
- int rc;
struct smb_rqst *rqst;
- int rqst_idx;
+ struct iov_iter iter;
+ unsigned int remaining_data_length, klen;
+ int rc, i, rqst_idx;
if (info->transport_status != SMBD_CONNECTED)
return -EAGAIN;
@@ -2023,84 +1982,36 @@ int smbd_send(struct TCP_Server_Info *server,
rqst_idx = 0;
do {
rqst = &rqst_array[rqst_idx];
- iov = rqst->rq_iov;
cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
- rqst_idx, smb_rqst_len(server, rqst));
- remaining_vec_data_length = 0;
- for (i = 0; i < rqst->rq_nvec; i++) {
- remaining_vec_data_length += iov[i].iov_len;
- dump_smb(iov[i].iov_base, iov[i].iov_len);
- }
-
- log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n",
- rqst_idx, rqst->rq_nvec,
- rqst->rq_npages, rqst->rq_pagesz,
- rqst->rq_tailsz, smb_rqst_len(server, rqst));
-
- start = 0;
- offset = 0;
- do {
- buflen = 0;
- i = start;
- j = 0;
- while (i < rqst->rq_nvec &&
- j < SMBDIRECT_MAX_SEND_SGE - 1 &&
- buflen < max_iov_size) {
-
- vecs[j].iov_base = iov[i].iov_base + offset;
- if (buflen + iov[i].iov_len > max_iov_size) {
- vecs[j].iov_len =
- max_iov_size - iov[i].iov_len;
- buflen = max_iov_size;
- offset = vecs[j].iov_len;
- } else {
- vecs[j].iov_len =
- iov[i].iov_len - offset;
- buflen += vecs[j].iov_len;
- offset = 0;
- ++i;
- }
- ++j;
- }
+ rqst_idx, smb_rqst_len(server, rqst));
+ for (i = 0; i < rqst->rq_nvec; i++)
+ dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
+
+ log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
+ rqst_idx, rqst->rq_nvec, remaining_data_length,
+ iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
+
+ /* Send the metadata pages. */
+ klen = 0;
+ for (i = 0; i < rqst->rq_nvec; i++)
+ klen += rqst->rq_iov[i].iov_len;
+ iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
+
+ rc = smbd_post_send_iter(info, &iter, &remaining_data_length);
+ if (rc < 0)
+ break;
- remaining_vec_data_length -= buflen;
- remaining_data_length -= buflen;
- log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n",
- remaining_vec_data_length > 0 ?
- "partial" : "complete",
- rqst->rq_nvec, start, j,
- remaining_data_length);
-
- start = i;
- rc = smbd_post_send_data(info, vecs, j, remaining_data_length);
- if (rc)
- goto done;
- } while (remaining_vec_data_length > 0);
-
- /* now sending pages if there are any */
- for (i = 0; i < rqst->rq_npages; i++) {
- rqst_page_get_length(rqst, i, &buflen, &offset);
- nvecs = (buflen + max_iov_size - 1) / max_iov_size;
- log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
- buflen, nvecs);
- for (j = 0; j < nvecs; j++) {
- size = min_t(unsigned int, max_iov_size, remaining_data_length);
- remaining_data_length -= size;
- log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n",
- i, j * max_iov_size + offset, size,
- remaining_data_length);
- rc = smbd_post_send_page(
- info, rqst->rq_pages[i],
- j*max_iov_size + offset,
- size, remaining_data_length);
- if (rc)
- goto done;
- }
+ if (iov_iter_count(&rqst->rq_iter) > 0) {
+ /* And then the data pages if there are any */
+ rc = smbd_post_send_iter(info, &rqst->rq_iter,
+ &remaining_data_length);
+ if (rc < 0)
+ break;
}
+
} while (++rqst_idx < num_rqst);
-done:
/*
* As an optimization, we don't wait for individual I/O to finish
* before sending the next one.
@@ -2191,10 +2102,10 @@ static void destroy_mr_list(struct smbd_connection *info)
cancel_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(info->id->device, mr->sgl,
- mr->sgl_count, mr->dir);
+ ib_dma_unmap_sg(info->id->device, mr->sgt.sgl,
+ mr->sgt.nents, mr->dir);
ib_dereg_mr(mr->mr);
- kfree(mr->sgl);
+ kfree(mr->sgt.sgl);
kfree(mr);
}
}
@@ -2217,6 +2128,7 @@ static int allocate_mr_list(struct smbd_connection *info)
atomic_set(&info->mr_ready_count, 0);
atomic_set(&info->mr_used_count, 0);
init_waitqueue_head(&info->wait_for_mr_cleanup);
+ INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
/* Allocate more MRs (2x) than hardware responder_resources */
for (i = 0; i < info->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
@@ -2229,11 +2141,10 @@ static int allocate_mr_list(struct smbd_connection *info)
info->mr_type, info->max_frmr_depth);
goto out;
}
- smbdirect_mr->sgl = kcalloc(
- info->max_frmr_depth,
- sizeof(struct scatterlist),
- GFP_KERNEL);
- if (!smbdirect_mr->sgl) {
+ smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth,
+ sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!smbdirect_mr->sgt.sgl) {
log_rdma_mr(ERR, "failed to allocate sgl\n");
ib_dereg_mr(smbdirect_mr->mr);
goto out;
@@ -2244,15 +2155,15 @@ static int allocate_mr_list(struct smbd_connection *info)
list_add_tail(&smbdirect_mr->list, &info->mr_list);
atomic_inc(&info->mr_ready_count);
}
- INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
return 0;
out:
kfree(smbdirect_mr);
list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
+ list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
- kfree(smbdirect_mr->sgl);
+ kfree(smbdirect_mr->sgt.sgl);
kfree(smbdirect_mr);
}
return -ENOMEM;
@@ -2305,26 +2216,45 @@ again:
}
/*
+ * Transcribe the pages from an iterator into an MR scatterlist.
+ */
+static int smbd_iter_to_mr(struct smbd_connection *info,
+ struct iov_iter *iter,
+ struct sg_table *sgt,
+ unsigned int max_sg)
+{
+ int ret;
+
+ memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
+
+ ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
+ WARN_ON(ret < 0);
+ if (sgt->nents > 0)
+ sg_mark_end(&sgt->sgl[sgt->nents - 1]);
+ return ret;
+}
+
+/*
* Register memory for RDMA read/write
- * pages[]: the list of pages to register memory with
- * num_pages: the number of pages to register
- * tailsz: if non-zero, the bytes to register in the last page
+ * iter: the buffer to register memory with
* writing: true if this is a RDMA write (SMB read), false for RDMA read
* need_invalidate: true if this MR needs to be locally invalidated after I/O
* return value: the MR registered, NULL if failed.
*/
-struct smbd_mr *smbd_register_mr(
- struct smbd_connection *info, struct page *pages[], int num_pages,
- int offset, int tailsz, bool writing, bool need_invalidate)
+struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
+ struct iov_iter *iter,
+ bool writing, bool need_invalidate)
{
struct smbd_mr *smbdirect_mr;
- int rc, i;
+ int rc, num_pages;
enum dma_data_direction dir;
struct ib_reg_wr *reg_wr;
+ num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1);
if (num_pages > info->max_frmr_depth) {
log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
num_pages, info->max_frmr_depth);
+ WARN_ON_ONCE(1);
return NULL;
}
@@ -2333,45 +2263,31 @@ struct smbd_mr *smbd_register_mr(
log_rdma_mr(ERR, "get_mr returning NULL\n");
return NULL;
}
- smbdirect_mr->need_invalidate = need_invalidate;
- smbdirect_mr->sgl_count = num_pages;
- sg_init_table(smbdirect_mr->sgl, num_pages);
-
- log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n",
- num_pages, offset, tailsz);
-
- if (num_pages == 1) {
- sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset);
- goto skip_multiple_pages;
- }
- /* We have at least two pages to register */
- sg_set_page(
- &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset);
- i = 1;
- while (i < num_pages - 1) {
- sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
- i++;
- }
- sg_set_page(&smbdirect_mr->sgl[i], pages[i],
- tailsz ? tailsz : PAGE_SIZE, 0);
-
-skip_multiple_pages:
dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
smbdirect_mr->dir = dir;
- rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
+ smbdirect_mr->need_invalidate = need_invalidate;
+ smbdirect_mr->sgt.nents = 0;
+ smbdirect_mr->sgt.orig_nents = 0;
+
+ log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
+ num_pages, iov_iter_count(iter), info->max_frmr_depth);
+ smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
+
+ rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ smbdirect_mr->sgt.nents, dir);
if (!rc) {
log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
num_pages, dir, rc);
goto dma_map_error;
}
- rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
- NULL, PAGE_SIZE);
- if (rc != num_pages) {
+ rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl,
+ smbdirect_mr->sgt.nents, NULL, PAGE_SIZE);
+ if (rc != smbdirect_mr->sgt.nents) {
log_rdma_mr(ERR,
- "ib_map_mr_sg failed rc = %d num_pages = %x\n",
- rc, num_pages);
+ "ib_map_mr_sg failed rc = %d nents = %x\n",
+ rc, smbdirect_mr->sgt.nents);
goto map_mr_error;
}
@@ -2403,8 +2319,8 @@ skip_multiple_pages:
/* If all failed, attempt to recover this MR by setting it MR_ERROR*/
map_mr_error:
- ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count, smbdirect_mr->dir);
+ ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ smbdirect_mr->sgt.nents, smbdirect_mr->dir);
dma_map_error:
smbdirect_mr->state = MR_ERROR;
@@ -2471,8 +2387,8 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
if (smbdirect_mr->state == MR_INVALIDATED) {
ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
+ info->id->device, smbdirect_mr->sgt.sgl,
+ smbdirect_mr->sgt.nents,
smbdirect_mr->dir);
smbdirect_mr->state = MR_READY;
if (atomic_inc_return(&info->mr_ready_count) == 1)
@@ -2490,3 +2406,206 @@ done:
return rc;
}
+
+static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
+ struct page *lowest_page, size_t off, size_t len)
+{
+ struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
+ u64 addr;
+
+ addr = ib_dma_map_page(rdma->device, lowest_page,
+ off, len, rdma->direction);
+ if (ib_dma_mapping_error(rdma->device, addr))
+ return false;
+
+ sge->addr = addr;
+ sge->length = len;
+ sge->lkey = rdma->local_dma_lkey;
+ rdma->nr_sge++;
+ return true;
+}
+
+/*
+ * Extract page fragments from a BVEC-class iterator and add them to an RDMA
+ * element list. The pages are not pinned.
+ */
+static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
+ struct smb_extract_to_rdma *rdma,
+ ssize_t maxsize)
+{
+ const struct bio_vec *bv = iter->bvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ size_t off, len;
+
+ len = bv[i].bv_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ len = min_t(size_t, maxsize, len - start);
+ off = bv[i].bv_offset + start;
+
+ if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
+ return -EIO;
+
+ ret += len;
+ maxsize -= len;
+ if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
+ * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
+ * The pages are not pinned.
+ */
+static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
+ struct smb_extract_to_rdma *rdma,
+ ssize_t maxsize)
+{
+ const struct kvec *kv = iter->kvec;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ struct page *page;
+ unsigned long kaddr;
+ size_t off, len, seg;
+
+ len = kv[i].iov_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ kaddr = (unsigned long)kv[i].iov_base + start;
+ off = kaddr & ~PAGE_MASK;
+ len = min_t(size_t, maxsize, len - start);
+ kaddr &= PAGE_MASK;
+
+ maxsize -= len;
+ do {
+ seg = min_t(size_t, len, PAGE_SIZE - off);
+
+ if (is_vmalloc_or_module_addr((void *)kaddr))
+ page = vmalloc_to_page((void *)kaddr);
+ else
+ page = virt_to_page(kaddr);
+
+ if (!smb_set_sge(rdma, page, off, seg))
+ return -EIO;
+
+ ret += seg;
+ len -= seg;
+ kaddr += PAGE_SIZE;
+ off = 0;
+ } while (len > 0 && rdma->nr_sge < rdma->max_sge);
+
+ if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
+ break;
+ start = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * Extract folio fragments from an XARRAY-class iterator and add them to an
+ * RDMA list. The folios are not pinned.
+ */
+static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter,
+ struct smb_extract_to_rdma *rdma,
+ ssize_t maxsize)
+{
+ struct xarray *xa = iter->xarray;
+ struct folio *folio;
+ loff_t start = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = start / PAGE_SIZE;
+ ssize_t ret = 0;
+ size_t off, len;
+ XA_STATE(xas, xa, index);
+
+ rcu_read_lock();
+
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ off = offset_in_folio(folio, start);
+ len = min_t(size_t, maxsize, folio_size(folio) - off);
+
+ if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+
+ maxsize -= len;
+ ret += len;
+ if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Extract page fragments from up to the given amount of the source iterator
+ * and build up an RDMA list that refers to all of those bits. The RDMA list
+ * is appended to, up to the maximum number of elements set in the parameter
+ * block.
+ *
+ * The extracted page fragments are not pinned or ref'd in any way; if an
+ * IOVEC/UBUF-type iterator is to be used, it should be converted to a
+ * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
+ * way.
+ */
+static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
+ struct smb_extract_to_rdma *rdma)
+{
+ ssize_t ret;
+ int before = rdma->nr_sge;
+
+ switch (iov_iter_type(iter)) {
+ case ITER_BVEC:
+ ret = smb_extract_bvec_to_rdma(iter, rdma, len);
+ break;
+ case ITER_KVEC:
+ ret = smb_extract_kvec_to_rdma(iter, rdma, len);
+ break;
+ case ITER_XARRAY:
+ ret = smb_extract_xarray_to_rdma(iter, rdma, len);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ if (ret > 0) {
+ iov_iter_advance(iter, ret);
+ } else if (ret < 0) {
+ while (rdma->nr_sge > before) {
+ struct ib_sge *sge = &rdma->sge[rdma->nr_sge--];
+
+ ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
+ rdma->direction);
+ sge->addr = 0;
+ }
+ }
+
+ return ret;
+}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 207ef979cd51..83f239f376f0 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -288,8 +288,7 @@ struct smbd_mr {
struct list_head list;
enum mr_state state;
struct ib_mr *mr;
- struct scatterlist *sgl;
- int sgl_count;
+ struct sg_table sgt;
enum dma_data_direction dir;
union {
struct ib_reg_wr wr;
@@ -302,8 +301,8 @@ struct smbd_mr {
/* Interfaces to register and deregister MR for RDMA read/write */
struct smbd_mr *smbd_register_mr(
- struct smbd_connection *info, struct page *pages[], int num_pages,
- int offset, int tailsz, bool writing, bool need_invalidate);
+ struct smbd_connection *info, struct iov_iter *iter,
+ bool writing, bool need_invalidate);
int smbd_deregister_mr(struct smbd_mr *mr);
#else
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 3851d0aaa288..b42050c68e6c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -270,26 +270,7 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst)
for (i = 0; i < nvec; i++)
buflen += iov[i].iov_len;
- /*
- * Add in the page array if there is one. The caller needs to make
- * sure rq_offset and rq_tailsz are set correctly. If a buffer of
- * multiple pages ends at page boundary, rq_tailsz needs to be set to
- * PAGE_SIZE.
- */
- if (rqst->rq_npages) {
- if (rqst->rq_npages == 1)
- buflen += rqst->rq_tailsz;
- else {
- /*
- * If there is more than one page, calculate the
- * buffer length based on rq_offset and rq_tailsz
- */
- buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) -
- rqst->rq_offset;
- buflen += rqst->rq_tailsz;
- }
- }
-
+ buflen += iov_iter_count(&rqst->rq_iter);
return buflen;
}
@@ -376,23 +357,15 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
total_len += sent;
- /* now walk the page array and send each page in it */
- for (i = 0; i < rqst[j].rq_npages; i++) {
- struct bio_vec bvec;
-
- bvec.bv_page = rqst[j].rq_pages[i];
- rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
- &bvec.bv_offset);
-
- iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE,
- &bvec, 1, bvec.bv_len);
+ if (iov_iter_count(&rqst[j].rq_iter) > 0) {
+ smb_msg.msg_iter = rqst[j].rq_iter;
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
break;
-
total_len += sent;
}
- }
+
+}
unmask:
sigprocmask(SIG_SETMASK, &oldmask, NULL);
@@ -1034,15 +1007,40 @@ cifs_cancelled_callback(struct mid_q_entry *mid)
struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
{
uint index = 0;
+ unsigned int min_in_flight = UINT_MAX, max_in_flight = 0;
+ struct TCP_Server_Info *server = NULL;
+ int i;
if (!ses)
return NULL;
- /* round robin */
- index = (uint)atomic_inc_return(&ses->chan_seq);
-
spin_lock(&ses->chan_lock);
- index %= ses->chan_count;
+ for (i = 0; i < ses->chan_count; i++) {
+ server = ses->chans[i].server;
+ if (!server)
+ continue;
+
+ /*
+ * strictly speaking, we should pick up req_lock to read
+ * server->in_flight. But it shouldn't matter much here if we
+ * race while reading this data. The worst that can happen is
+ * that we could use a channel that's not least loaded. Avoiding
+ * taking the lock could help reduce wait time, which is
+ * important for this function
+ */
+ if (server->in_flight < min_in_flight) {
+ min_in_flight = server->in_flight;
+ index = i;
+ }
+ if (server->in_flight > max_in_flight)
+ max_in_flight = server->in_flight;
+ }
+
+ /* if all channels are equally loaded, fall back to round-robin */
+ if (min_in_flight == max_in_flight) {
+ index = (uint)atomic_inc_return(&ses->chan_seq);
+ index %= ses->chan_count;
+ }
spin_unlock(&ses->chan_lock);
return ses->chans[index].server;
@@ -1640,11 +1638,11 @@ int
cifs_discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = server->pdu_size;
- int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) -
+ size_t remaining = rfclen + HEADER_PREAMBLE_SIZE(server) -
server->total_read;
while (remaining > 0) {
- int length;
+ ssize_t length;
length = cifs_discard_from_socket(server,
min_t(size_t, remaining,
@@ -1790,10 +1788,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return cifs_readv_discard(server, mid);
}
- length = rdata->read_into_pages(server, rdata, data_len);
- if (length < 0)
- return length;
-
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (rdata->mr)
+ length = data_len; /* An RDMA read is already done. */
+ else
+#endif
+ length = cifs_read_iter_from_socket(server, &rdata->iter,
+ data_len);
+ if (length > 0)
+ rdata->got_bytes += length;
server->total_read += length;
cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 61cd6c2628fa..a9b14f81d655 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -54,6 +54,7 @@
#include <net/ipv6.h>
#include <trace/events/dlm.h>
+#include <trace/events/sock.h>
#include "dlm_internal.h"
#include "lowcomms.h"
@@ -502,6 +503,8 @@ static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con = sock2con(sk);
+ trace_sk_data_ready(sk);
+
set_bit(CF_RECV_INTR, &con->flags);
lowcomms_queue_rwork(con);
}
@@ -533,6 +536,8 @@ static void lowcomms_state_change(struct sock *sk)
static void lowcomms_listen_data_ready(struct sock *sk)
{
+ trace_sk_data_ready(sk);
+
queue_work(io_workqueue, &listen_con.rwork);
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index bd3f3c755b24..c16f0d660cb7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -260,22 +260,6 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
return i;
}
-struct extent_crypt_result {
- struct completion completion;
- int rc;
-};
-
-static void extent_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct extent_crypt_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
-
- ecr->rc = rc;
- complete(&ecr->completion);
-}
-
/**
* crypt_scatterlist
* @crypt_stat: Pointer to the crypt_stat struct to initialize.
@@ -293,7 +277,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
unsigned char *iv, int op)
{
struct skcipher_request *req = NULL;
- struct extent_crypt_result ecr;
+ DECLARE_CRYPTO_WAIT(ecr);
int rc = 0;
if (unlikely(ecryptfs_verbosity > 0)) {
@@ -303,8 +287,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->key_size);
}
- init_completion(&ecr.completion);
-
mutex_lock(&crypt_stat->cs_tfm_mutex);
req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
if (!req) {
@@ -315,7 +297,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- extent_crypt_complete, &ecr);
+ crypto_req_done, &ecr);
/* Consider doing this once, when the file is opened */
if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key,
@@ -334,13 +316,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) :
crypto_skcipher_decrypt(req);
- if (rc == -EINPROGRESS || rc == -EBUSY) {
- struct extent_crypt_result *ecr = req->base.data;
-
- wait_for_completion(&ecr->completion);
- rc = ecr->rc;
- reinit_completion(&ecr->completion);
- }
+ rc = crypto_wait_req(rc, &ecr);
out:
skcipher_request_free(req);
return rc;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index e782b4f1d104..2748a82de42a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -37,10 +37,10 @@
#include "aops.h"
-void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
- unsigned int from, unsigned int len)
+void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
+ unsigned int from, unsigned int len)
{
- struct buffer_head *head = page_buffers(page);
+ struct buffer_head *head = folio_buffers(folio);
unsigned int bsize = head->b_size;
struct buffer_head *bh;
unsigned int to = from + len;
@@ -127,7 +127,6 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
{
struct inode *inode = page->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -135,7 +134,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
create_empty_buffers(page, inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
- gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize);
+ gfs2_trans_add_databufs(ip, page_folio(page), 0, PAGE_SIZE);
}
return gfs2_write_jdata_page(page, wbc);
}
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index ff9877a68780..09db1914425e 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -9,7 +9,7 @@
#include "incore.h"
extern void adjust_fs_space(struct inode *inode);
-extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
- unsigned int from, unsigned int len);
+extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
+ unsigned int from, unsigned int len);
#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e7537fd305dd..eedf6926c652 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -956,26 +956,40 @@ hole_found:
goto out;
}
-static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
- unsigned len)
+static struct folio *
+gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
{
+ struct inode *inode = iter->inode;
unsigned int blockmask = i_blocksize(inode) - 1;
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int blocks;
+ struct folio *folio;
+ int status;
blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
- return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
+ status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
+ if (status)
+ return ERR_PTR(status);
+
+ folio = iomap_get_folio(iter, pos);
+ if (IS_ERR(folio))
+ gfs2_trans_end(sdp);
+ return folio;
}
-static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
- unsigned copied, struct page *page)
+static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
+ unsigned copied, struct folio *folio)
{
struct gfs2_trans *tr = current->journal_info;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (page && !gfs2_is_stuffed(ip))
- gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+ if (!gfs2_is_stuffed(ip))
+ gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
+ copied);
+
+ folio_unlock(folio);
+ folio_put(folio);
if (tr->tr_num_buf_new)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
gfs2_trans_end(sdp);
}
-static const struct iomap_page_ops gfs2_iomap_page_ops = {
- .page_prepare = gfs2_iomap_page_prepare,
- .page_done = gfs2_iomap_page_done,
+static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
+ .get_folio = gfs2_iomap_get_folio,
+ .put_folio = gfs2_iomap_put_folio,
};
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
@@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
}
if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
- iomap->page_ops = &gfs2_iomap_page_ops;
+ iomap->folio_ops = &gfs2_iomap_folio_ops;
return 0;
out_trans_end:
@@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
/*
* NOTE: Never call gfs2_block_zero_range with an open transaction because it
* uses iomap write to perform its actions, which begin their own transactions
- * (iomap_begin, page_prepare, etc.)
+ * (iomap_begin, get_folio, etc.)
*/
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 2e215e8c3c88..6fe9ca253b70 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -83,26 +83,8 @@ static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
return 0;
}
-static int gfs2_dentry_delete(const struct dentry *dentry)
-{
- struct gfs2_inode *ginode;
-
- if (d_really_is_negative(dentry))
- return 0;
-
- ginode = GFS2_I(d_inode(dentry));
- if (!gfs2_holder_initialized(&ginode->i_iopen_gh))
- return 0;
-
- if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
- return 1;
-
- return 0;
-}
-
const struct dentry_operations gfs2_dops = {
.d_revalidate = gfs2_drevalidate,
.d_hash = gfs2_dhash,
- .d_delete = gfs2_dentry_delete,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 524f3c96b9a4..5adc7d85dbf3 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -67,7 +67,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
-struct workqueue_struct *gfs2_delete_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
@@ -274,9 +273,8 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
struct address_space *mapping = gfs2_glock2aspace(gl);
lockref_mark_dead(&gl->gl_lockref);
-
- gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
+ gfs2_glock_remove_from_lru(gl);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
if (mapping) {
truncate_inode_pages_final(mapping);
@@ -883,6 +881,7 @@ void glock_set_object(struct gfs2_glock *gl, void *object)
/**
* glock_clear_object - clear the gl_object field of a glock
* @gl: the glock
+ * @object: object the glock currently points at
*/
void glock_clear_object(struct gfs2_glock *gl, void *object)
{
@@ -892,8 +891,7 @@ void glock_clear_object(struct gfs2_glock *gl, void *object)
prev_object = gl->gl_object;
gl->gl_object = NULL;
spin_unlock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd,
- prev_object == object || prev_object == NULL)) {
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) {
pr_warn("glock=%u/%llx\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number);
@@ -977,6 +975,26 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
return evicted;
}
+bool gfs2_queue_try_to_evict(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags))
+ return false;
+ return queue_delayed_work(sdp->sd_delete_wq,
+ &gl->gl_delete, 0);
+}
+
+static bool gfs2_queue_verify_evict(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags))
+ return false;
+ return queue_delayed_work(sdp->sd_delete_wq,
+ &gl->gl_delete, 5 * HZ);
+}
+
static void delete_work_func(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
@@ -985,11 +1003,7 @@ static void delete_work_func(struct work_struct *work)
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
- spin_unlock(&gl->gl_lockref.lock);
-
- if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
+ if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) {
/*
* If we can evict the inode, give the remote node trying to
* delete the inode some time before verifying that the delete
@@ -1008,22 +1022,28 @@ static void delete_work_func(struct work_struct *work)
* step entirely.
*/
if (gfs2_try_evict(gl)) {
- if (gfs2_queue_delete_work(gl, 5 * HZ))
+ if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
+ goto out;
+ if (gfs2_queue_verify_evict(gl))
return;
}
goto out;
}
- inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
- GFS2_BLKST_UNLINKED);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -EAGAIN &&
- (gfs2_queue_delete_work(gl, 5 * HZ)))
+ if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) {
+ inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
+ GFS2_BLKST_UNLINKED);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -EAGAIN &&
+ !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) &&
+ gfs2_queue_verify_evict(gl))
return;
- } else {
- d_prune_aliases(inode);
- iput(inode);
+ } else {
+ d_prune_aliases(inode);
+ iput(inode);
+ }
}
+
out:
gfs2_glock_put(gl);
}
@@ -1985,26 +2005,26 @@ add_back_to_lru:
static long gfs2_scan_glock_lru(int nr)
{
- struct gfs2_glock *gl;
- LIST_HEAD(skipped);
+ struct gfs2_glock *gl, *next;
LIST_HEAD(dispose);
long freed = 0;
spin_lock(&lru_lock);
- while ((nr-- >= 0) && !list_empty(&lru_list)) {
- gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru);
-
+ list_for_each_entry_safe(gl, next, &lru_list, gl_lru) {
+ if (nr-- <= 0)
+ break;
/* Test for being demotable */
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
- list_move(&gl->gl_lru, &dispose);
- atomic_dec(&lru_count);
- freed++;
- continue;
+ if (!spin_trylock(&gl->gl_lockref.lock))
+ continue;
+ if (!gl->gl_lockref.count) {
+ list_move(&gl->gl_lru, &dispose);
+ atomic_dec(&lru_count);
+ freed++;
+ }
+ spin_unlock(&gl->gl_lockref.lock);
}
-
- list_move(&gl->gl_lru, &skipped);
}
- list_splice(&skipped, &lru_list);
if (!list_empty(&dispose))
gfs2_dispose_glock_lru(&dispose);
spin_unlock(&lru_lock);
@@ -2063,37 +2083,21 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
rhashtable_walk_exit(&iter);
}
-bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay)
-{
- bool queued;
-
- spin_lock(&gl->gl_lockref.lock);
- queued = queue_delayed_work(gfs2_delete_workqueue,
- &gl->gl_delete, delay);
- if (queued)
- set_bit(GLF_PENDING_DELETE, &gl->gl_flags);
- spin_unlock(&gl->gl_lockref.lock);
- return queued;
-}
-
void gfs2_cancel_delete_work(struct gfs2_glock *gl)
{
- if (cancel_delayed_work(&gl->gl_delete)) {
- clear_bit(GLF_PENDING_DELETE, &gl->gl_flags);
+ clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags);
+ clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags);
+ if (cancel_delayed_work(&gl->gl_delete))
gfs2_glock_put(gl);
- }
-}
-
-bool gfs2_delete_work_queued(const struct gfs2_glock *gl)
-{
- return test_bit(GLF_PENDING_DELETE, &gl->gl_flags);
}
static void flush_delete_work(struct gfs2_glock *gl)
{
if (gl->gl_name.ln_type == LM_TYPE_IOPEN) {
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
if (cancel_delayed_work(&gl->gl_delete)) {
- queue_delayed_work(gfs2_delete_workqueue,
+ queue_delayed_work(sdp->sd_delete_wq,
&gl->gl_delete, 0);
}
}
@@ -2102,7 +2106,7 @@ static void flush_delete_work(struct gfs2_glock *gl)
void gfs2_flush_delete_work(struct gfs2_sbd *sdp)
{
glock_hash_walk(flush_delete_work, sdp);
- flush_workqueue(gfs2_delete_workqueue);
+ flush_workqueue(sdp->sd_delete_wq);
}
/**
@@ -2308,14 +2312,16 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'o';
if (test_bit(GLF_BLOCKING, gflags))
*p++ = 'b';
- if (test_bit(GLF_PENDING_DELETE, gflags))
- *p++ = 'P';
if (test_bit(GLF_FREEING, gflags))
*p++ = 'x';
if (test_bit(GLF_INSTANTIATE_NEEDED, gflags))
*p++ = 'n';
if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags))
*p++ = 'N';
+ if (test_bit(GLF_TRY_TO_EVICT, gflags))
+ *p++ = 'e';
+ if (test_bit(GLF_VERIFY_EVICT, gflags))
+ *p++ = 'E';
*p = 0;
return buf;
}
@@ -2465,18 +2471,9 @@ int __init gfs2_glock_init(void)
rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
- gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
- WQ_MEM_RECLAIM | WQ_FREEZABLE,
- 0);
- if (!gfs2_delete_workqueue) {
- destroy_workqueue(glock_workqueue);
- rhashtable_destroy(&gl_hash_table);
- return -ENOMEM;
- }
ret = register_shrinker(&glock_shrinker, "gfs2-glock");
if (ret) {
- destroy_workqueue(gfs2_delete_workqueue);
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
return ret;
@@ -2493,7 +2490,6 @@ void gfs2_glock_exit(void)
unregister_shrinker(&glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
- destroy_workqueue(gfs2_delete_workqueue);
}
static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f37ac087e2c1..1f1ba92c15a8 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -144,7 +144,6 @@ struct gfs2_glock_aspace {
struct address_space mapping;
};
-extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
struct gfs2_holder *gh;
@@ -268,9 +267,8 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay);
+extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl);
extern void gfs2_cancel_delete_work(struct gfs2_glock *gl);
-extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl);
extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d78b61ecc1cd..ad14818a790a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -193,7 +193,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
int error;
- if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+ if (!rgd || !test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return 0;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
@@ -222,9 +222,12 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
const unsigned bsize = sdp->sd_sb.sb_bsize;
- loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
- loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
+ loff_t start, end;
+ if (!rgd)
+ return;
+ start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
truncate_inode_pages_range(mapping, start, end);
@@ -645,23 +648,18 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
struct gfs2_inode *ip = gl->gl_object;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (!remote || sb_rdonly(sdp->sd_vfs))
+ if (!remote || sb_rdonly(sdp->sd_vfs) ||
+ test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
return;
if (gl->gl_demote_state == LM_ST_UNLOCKED &&
gl->gl_state == LM_ST_SHARED && ip) {
gl->gl_lockref.count++;
- if (!queue_delayed_work(gfs2_delete_workqueue,
- &gl->gl_delete, 0))
+ if (!gfs2_queue_try_to_evict(gl))
gl->gl_lockref.count--;
}
}
-static int iopen_go_demote_ok(const struct gfs2_glock *gl)
-{
- return !gfs2_delete_work_queued(gl);
-}
-
/**
* inode_go_free - wake up anyone waiting for dlm's unlock ast to free it
* @gl: glock being freed
@@ -767,7 +765,6 @@ const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
.go_dump = inode_go_dump,
- .go_demote_ok = iopen_go_demote_ok,
.go_flags = GLOF_LRU | GLOF_NONDISK,
.go_subclass = 1,
};
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c26765080f28..79485329118b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -329,8 +329,9 @@ enum {
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
- GLF_PENDING_DELETE = 17,
- GLF_FREEING = 18, /* Wait for glock to be freed */
+ GLF_FREEING = 16, /* Wait for glock to be freed */
+ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */
+ GLF_VERIFY_EVICT = 18, /* iopen glocks only */
};
struct gfs2_glock {
@@ -605,6 +606,8 @@ enum {
SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */
SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are
withdrawing */
+ SDF_DEACTIVATING = 15,
+ SDF_EVICTING = 16,
};
enum gfs2_freeze_state {
@@ -771,6 +774,10 @@ struct gfs2_sbd {
struct completion sd_journal_ready;
+ /* Workqueue stuff */
+
+ struct workqueue_struct *sd_delete_wq;
+
/* Daemon stuff */
struct task_struct *sd_logd_process;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 713efa3bb732..1291b5ee3584 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -225,6 +225,10 @@ fail:
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
if (gfs2_holder_initialized(&i_gh))
gfs2_glock_dq_uninit(&i_gh);
+ if (ip->i_gl) {
+ gfs2_glock_put(ip->i_gl);
+ ip->i_gl = NULL;
+ }
iget_failed(inode);
return ERR_PTR(error);
}
@@ -816,6 +820,10 @@ fail_gunlock3:
fail_gunlock2:
gfs2_glock_put(io_gl);
fail_free_inode:
+ if (ip->i_gl) {
+ gfs2_glock_put(ip->i_gl);
+ ip->i_gl = NULL;
+ }
gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c0cf1d2d0ef5..6de901c3b89b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1197,9 +1197,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
+ sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
+ error = -ENOMEM;
+ if (!sdp->sd_delete_wq)
+ goto fail_free;
+
error = gfs2_sys_fs_add(sdp);
if (error)
- goto fail_free;
+ goto fail_delete_wq;
gfs2_create_debugfs_file(sdp);
@@ -1309,6 +1315,8 @@ fail_lm:
fail_debug:
gfs2_delete_debugfs_file(sdp);
gfs2_sys_fs_del(sdp);
+fail_delete_wq:
+ destroy_workqueue(sdp->sd_delete_wq);
fail_free:
free_sbd(sdp);
sb->s_fs_info = NULL;
@@ -1720,6 +1728,55 @@ static int gfs2_meta_init_fs_context(struct fs_context *fc)
return 0;
}
+/**
+ * gfs2_evict_inodes - evict inodes cooperatively
+ * @sb: the superblock
+ *
+ * When evicting an inode with a zero link count, we are trying to upgrade the
+ * inode's iopen glock from SH to EX mode in order to determine if we can
+ * delete the inode. The other nodes are supposed to evict the inode from
+ * their caches if they can, and to poke the inode's inode glock if they cannot
+ * do so. Either behavior allows gfs2_upgrade_iopen_glock() to proceed
+ * quickly, but if the other nodes are not cooperating, the lock upgrading
+ * attempt will time out. Since inodes are evicted sequentially, this can add
+ * up quickly.
+ *
+ * Function evict_inodes() tries to keep the s_inode_list_lock list locked over
+ * a long time, which prevents other inodes from being evicted concurrently.
+ * This precludes the cooperative behavior we are looking for. This special
+ * version of evict_inodes() avoids that.
+ *
+ * Modeled after drop_pagecache_sb().
+ */
+static void gfs2_evict_inodes(struct super_block *sb)
+{
+ struct inode *inode, *toput_inode = NULL;
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ set_bit(SDF_EVICTING, &sdp->sd_flags);
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ spin_lock(&inode->i_lock);
+ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
+ !need_resched()) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ atomic_inc(&inode->i_count);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
+ iput(toput_inode);
+ toput_inode = inode;
+
+ cond_resched();
+ spin_lock(&sb->s_inode_list_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+ iput(toput_inode);
+}
+
static void gfs2_kill_sb(struct super_block *sb)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
@@ -1735,6 +1792,18 @@ static void gfs2_kill_sb(struct super_block *sb)
sdp->sd_root_dir = NULL;
sdp->sd_master_dir = NULL;
shrink_dcache_sb(sb);
+
+ gfs2_evict_inodes(sb);
+
+ /*
+ * Flush and then drain the delete workqueue here (via
+ * destroy_workqueue()) to ensure that any delete work that
+ * may be running will also see the SDF_DEACTIVATING flag.
+ */
+ set_bit(SDF_DEACTIVATING, &sdp->sd_flags);
+ gfs2_flush_delete_work(sdp);
+ destroy_workqueue(sdp->sd_delete_wq);
+
kill_block_super(sb);
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f602fb844951..3b9b76e980ad 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1879,7 +1879,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
*/
ip = gl->gl_object;
- if (ip || !gfs2_queue_delete_work(gl, 0))
+ if (ip || !gfs2_queue_try_to_evict(gl))
gfs2_glock_put(gl);
else
found++;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 999cc146d708..a83fa62106f0 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -138,8 +138,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
return -EIO;
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
- if (error || gfs2_withdrawn(sdp))
+ if (error) {
+ gfs2_consist(sdp);
return error;
+ }
if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
gfs2_consist(sdp);
@@ -151,7 +153,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (!error && !gfs2_withdrawn(sdp))
+ if (!error && gfs2_withdrawn(sdp))
+ error = -EIO;
+ if (!error)
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
return error;
}
@@ -529,7 +533,9 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- gfs2_flush_delete_work(sdp);
+ if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
+ gfs2_flush_delete_work(sdp);
+
if (!log_write_allowed && current == sdp->sd_quotad_process)
fs_warn(sdp, "The quotad daemon is withdrawing.\n");
else if (sdp->sd_quotad_process)
@@ -933,6 +939,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
if (inode->i_nlink &&
gfs2_holder_initialized(&ip->i_iopen_gh)) {
@@ -952,11 +959,17 @@ static int gfs2_drop_inode(struct inode *inode)
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
gfs2_glock_hold(gl);
- if (!gfs2_queue_delete_work(gl, 0))
+ if (!gfs2_queue_try_to_evict(gl))
gfs2_glock_queue_put(gl);
return 0;
}
+ /*
+ * No longer cache inodes when trying to evict them all.
+ */
+ if (test_bit(SDF_EVICTING, &sdp->sd_flags))
+ return 1;
+
return generic_drop_inode(inode);
}
@@ -1175,15 +1188,23 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode)
gfs2_glock_dq_wait(gh);
/*
- * If there are no other lock holders, we'll get the lock immediately.
+ * If there are no other lock holders, we will immediately get
+ * exclusive access to the iopen glock here.
+ *
* Otherwise, the other nodes holding the lock will be notified about
- * our locking request. If they don't have the inode open, they'll
- * evict the cached inode and release the lock. Otherwise, if they
- * poke the inode glock, we'll take this as an indication that they
- * still need the iopen glock and that they'll take care of deleting
- * the inode when they're done. As a last resort, if another node
- * keeps holding the iopen glock without showing any activity on the
- * inode glock, we'll eventually time out.
+ * our locking request. If they do not have the inode open, they are
+ * expected to evict the cached inode and release the lock, allowing us
+ * to proceed.
+ *
+ * Otherwise, if they cannot evict the inode, they are expected to poke
+ * the inode glock (note: not the iopen glock). We will notice that
+ * and stop waiting for the iopen glock immediately. The other node(s)
+ * are then expected to take care of deleting the inode when they no
+ * longer use it.
+ *
+ * As a last resort, if another node keeps holding the iopen glock
+ * without showing any activity on the inode glock, we will eventually
+ * time out and fail the iopen glock upgrade.
*
* Note that we're passing the LM_FLAG_TRY_1CB flag to the first
* locking request as an optimization to notify lock holders as soon as
@@ -1401,10 +1422,8 @@ static void gfs2_evict_inode(struct inode *inode)
if (gfs2_rs_active(&ip->i_res))
gfs2_rs_deltree(&ip->i_res);
- if (gfs2_holder_initialized(&gh)) {
- glock_clear_object(ip->i_gl, ip);
+ if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
- }
if (ret && ret != GLR_TRYFAILED && ret != -EROFS)
fs_warn(sdp, "gfs2_evict_inode: %d\n", ret);
out:
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d87ea98cf535..c40118ea4bbc 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -87,6 +87,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
"Withdraw In Prog: %d\n"
"Remote Withdraw: %d\n"
"Withdraw Recovery: %d\n"
+ "Deactivating: %d\n"
"sd_log_error: %d\n"
"sd_log_flush_lock: %d\n"
"sd_log_num_revoke: %u\n"
@@ -115,6 +116,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
test_bit(SDF_WITHDRAW_IN_PROG, &f),
test_bit(SDF_REMOTE_WITHDRAW, &f),
test_bit(SDF_WITHDRAW_RECOVERY, &f),
+ test_bit(SDF_DEACTIVATING, &f),
sdp->sd_log_error,
rwsem_is_locked(&sdp->sd_log_flush_lock),
sdp->sd_log_num_revoke,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 356193e44cf0..d3c300563eb8 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
+/**
+ * iomap_get_folio - get a folio reference for writing
+ * @iter: iteration structure
+ * @pos: start offset of write
+ *
+ * Returns a locked reference to the folio at @pos, or an error pointer if the
+ * folio could not be obtained.
+ */
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+{
+ unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
+ struct folio *folio;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ fgp |= FGP_NOWAIT;
+
+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ fgp, mapping_gfp_mask(iter->inode->i_mapping));
+ if (folio)
+ return folio;
+
+ if (iter->flags & IOMAP_NOWAIT)
+ return ERR_PTR(-EAGAIN);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(iomap_get_folio);
+
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
{
trace_iomap_release_folio(folio->mapping->host, folio_pos(folio),
@@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
}
+static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
+ size_t len)
+{
+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+
+ if (folio_ops && folio_ops->get_folio)
+ return folio_ops->get_folio(iter, pos, len);
+ else
+ return iomap_get_folio(iter, pos);
+}
+
+static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
+ struct folio *folio)
+{
+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+
+ if (folio_ops && folio_ops->put_folio) {
+ folio_ops->put_folio(iter->inode, pos, ret, folio);
+ } else {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+}
+
static int iomap_write_begin_inline(const struct iomap_iter *iter,
struct folio *folio)
{
@@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
size_t len, struct folio **foliop)
{
- const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct folio *folio;
- unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0;
- if (iter->flags & IOMAP_NOWAIT)
- fgp |= FGP_NOWAIT;
-
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
if (!mapping_large_folio_support(iter->inode->i_mapping))
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
- if (page_ops && page_ops->page_prepare) {
- status = page_ops->page_prepare(iter->inode, pos, len);
- if (status)
- return status;
- }
-
- folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
- fgp, mapping_gfp_mask(iter->inode->i_mapping));
- if (!folio) {
- status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
- goto out_no_page;
- }
+ folio = __iomap_get_folio(iter, pos, len);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
/*
* Now we have a locked folio, before we do anything with it we need to
@@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
* could do the wrong thing here (zero a page range incorrectly or fail
* to zero) and corrupt data.
*/
- if (page_ops && page_ops->iomap_valid) {
- bool iomap_valid = page_ops->iomap_valid(iter->inode,
- &iter->iomap);
+ if (folio_ops && folio_ops->iomap_valid) {
+ bool iomap_valid = folio_ops->iomap_valid(iter->inode,
+ &iter->iomap);
if (!iomap_valid) {
iter->iomap.flags |= IOMAP_F_STALE;
status = 0;
@@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
return 0;
out_unlock:
- folio_unlock(folio);
- folio_put(folio);
+ __iomap_put_folio(iter, pos, 0, folio);
iomap_write_failed(iter->inode, pos, len);
-out_no_page:
- if (page_ops && page_ops->page_done)
- page_ops->page_done(iter->inode, pos, 0, NULL);
return status;
}
@@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter,
static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
- const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t old_size = iter->inode->i_size;
size_t ret;
@@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
i_size_write(iter->inode, pos + ret);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- folio_unlock(folio);
+ __iomap_put_folio(iter, pos, ret, folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
- if (page_ops && page_ops->page_done)
- page_ops->page_done(iter->inode, pos, ret, &folio->page);
- folio_put(folio);
-
if (ret < len)
iomap_write_failed(iter->inode, pos + ret, len - ret);
return ret;
diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig
index e1fe17747ed6..7055cb5d2880 100644
--- a/fs/ksmbd/Kconfig
+++ b/fs/ksmbd/Kconfig
@@ -33,14 +33,16 @@ config SMB_SERVER
in ksmbd-tools, available from
https://github.com/cifsd-team/ksmbd-tools.
More detail about how to run the ksmbd kernel server is
- available via README file
+ available via the README file
(https://github.com/cifsd-team/ksmbd-tools/blob/master/README).
ksmbd kernel server includes support for auto-negotiation,
Secure negotiate, Pre-authentication integrity, oplock/lease,
compound requests, multi-credit, packet signing, RDMA(smbdirect),
smb3 encryption, copy-offload, secure per-user session
- establishment via NTLM or NTLMv2.
+ establishment via Kerberos or NTLMv2.
+
+if SMB_SERVER
config SMB_SERVER_SMBDIRECT
bool "Support for SMB Direct protocol"
@@ -54,6 +56,8 @@ config SMB_SERVER_SMBDIRECT
SMB Direct allows transferring SMB packets over RDMA. If unsure,
say N.
+endif
+
config SMB_SERVER_CHECK_CAP_NET_ADMIN
bool "Enable check network administration capability"
depends on SMB_SERVER
diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c
index c03eba090368..cc6384f79675 100644
--- a/fs/ksmbd/asn1.c
+++ b/fs/ksmbd/asn1.c
@@ -208,9 +208,9 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen,
return 0;
}
-int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen,
- unsigned char tag, const void *value,
- size_t vlen)
+static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
+ unsigned char tag, const void *value,
+ size_t vlen)
{
struct ksmbd_conn *conn = context;
@@ -223,17 +223,16 @@ int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen,
return 0;
}
-int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen,
+int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen,
unsigned char tag, const void *value,
size_t vlen)
{
- struct ksmbd_conn *conn = context;
-
- conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL);
- if (!conn->mechToken)
- return -ENOMEM;
+ return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen);
+}
- memcpy(conn->mechToken, value, vlen);
- conn->mechToken[vlen] = '\0';
- return 0;
+int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen,
+ unsigned char tag, const void *value,
+ size_t vlen)
+{
+ return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen);
}
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 56be077e5d8a..5b10b03800c1 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -114,7 +114,7 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work)
if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) {
requests_queue = &conn->requests;
- work->syncronous = true;
+ work->synchronous = true;
}
if (requests_queue) {
@@ -139,7 +139,7 @@ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work)
spin_lock(&conn->request_lock);
if (!work->multiRsp) {
list_del_init(&work->request_entry);
- if (work->syncronous == false)
+ if (!work->synchronous)
list_del_init(&work->async_request_entry);
ret = 0;
}
@@ -312,7 +312,7 @@ int ksmbd_conn_handler_loop(void *p)
max_allowed_pdu_size = SMB3_MAX_MSGSIZE;
if (pdu_size > max_allowed_pdu_size) {
- pr_err_ratelimited("PDU length(%u) excceed maximum allowed pdu size(%u) on connection(%d)\n",
+ pr_err_ratelimited("PDU length(%u) exceeded maximum allowed pdu size(%u) on connection(%d)\n",
pdu_size, max_allowed_pdu_size,
conn->status);
break;
diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h
index 5ece58e40c97..3234f2cf6327 100644
--- a/fs/ksmbd/ksmbd_work.h
+++ b/fs/ksmbd/ksmbd_work.h
@@ -68,7 +68,7 @@ struct ksmbd_work {
/* Request is encrypted */
bool encrypted:1;
/* Is this SYNC or ASYNC ksmbd_work */
- bool syncronous:1;
+ bool synchronous:1;
bool need_invalidate_rkey:1;
unsigned int remote_key;
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 92b1603b5abe..1ca2aae4c299 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -25,20 +25,19 @@ static DECLARE_RWSEM(sessions_table_lock);
struct ksmbd_session_rpc {
int id;
unsigned int method;
- struct list_head list;
};
static void free_channel_list(struct ksmbd_session *sess)
{
- struct channel *chann, *tmp;
+ struct channel *chann;
+ unsigned long index;
- write_lock(&sess->chann_lock);
- list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
- chann_list) {
- list_del(&chann->chann_list);
+ xa_for_each(&sess->ksmbd_chann_list, index, chann) {
+ xa_erase(&sess->ksmbd_chann_list, index);
kfree(chann);
}
- write_unlock(&sess->chann_lock);
+
+ xa_destroy(&sess->ksmbd_chann_list);
}
static void __session_rpc_close(struct ksmbd_session *sess,
@@ -58,15 +57,14 @@ static void __session_rpc_close(struct ksmbd_session *sess,
static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess)
{
struct ksmbd_session_rpc *entry;
+ long index;
- while (!list_empty(&sess->rpc_handle_list)) {
- entry = list_entry(sess->rpc_handle_list.next,
- struct ksmbd_session_rpc,
- list);
-
- list_del(&entry->list);
+ xa_for_each(&sess->rpc_handle_list, index, entry) {
+ xa_erase(&sess->rpc_handle_list, index);
__session_rpc_close(sess, entry);
}
+
+ xa_destroy(&sess->rpc_handle_list);
}
static int __rpc_method(char *rpc_name)
@@ -102,13 +100,13 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL);
if (!entry)
- return -EINVAL;
+ return -ENOMEM;
- list_add(&entry->list, &sess->rpc_handle_list);
entry->method = method;
entry->id = ksmbd_ipc_id_alloc();
if (entry->id < 0)
goto free_entry;
+ xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL);
resp = ksmbd_rpc_open(sess, entry->id);
if (!resp)
@@ -117,9 +115,9 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
kvfree(resp);
return entry->id;
free_id:
+ xa_erase(&sess->rpc_handle_list, entry->id);
ksmbd_rpc_id_free(entry->id);
free_entry:
- list_del(&entry->list);
kfree(entry);
return -EINVAL;
}
@@ -128,24 +126,17 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
- list_for_each_entry(entry, &sess->rpc_handle_list, list) {
- if (entry->id == id) {
- list_del(&entry->list);
- __session_rpc_close(sess, entry);
- break;
- }
- }
+ entry = xa_erase(&sess->rpc_handle_list, id);
+ if (entry)
+ __session_rpc_close(sess, entry);
}
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
- list_for_each_entry(entry, &sess->rpc_handle_list, list) {
- if (entry->id == id)
- return entry->method;
- }
- return 0;
+ entry = xa_load(&sess->rpc_handle_list, id);
+ return entry ? entry->method : 0;
}
void ksmbd_session_destroy(struct ksmbd_session *sess)
@@ -190,21 +181,15 @@ int ksmbd_session_register(struct ksmbd_conn *conn,
static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess)
{
- struct channel *chann, *tmp;
-
- write_lock(&sess->chann_lock);
- list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list,
- chann_list) {
- if (chann->conn == conn) {
- list_del(&chann->chann_list);
- kfree(chann);
- write_unlock(&sess->chann_lock);
- return 0;
- }
- }
- write_unlock(&sess->chann_lock);
+ struct channel *chann;
+
+ chann = xa_erase(&sess->ksmbd_chann_list, (long)conn);
+ if (!chann)
+ return -ENOENT;
- return -ENOENT;
+ kfree(chann);
+
+ return 0;
}
void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
@@ -234,7 +219,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
return;
sess_destroy:
- if (list_empty(&sess->ksmbd_chann_list)) {
+ if (xa_empty(&sess->ksmbd_chann_list)) {
xa_erase(&conn->sessions, sess->id);
ksmbd_session_destroy(sess);
}
@@ -320,6 +305,9 @@ static struct ksmbd_session *__session_create(int protocol)
struct ksmbd_session *sess;
int ret;
+ if (protocol != CIFDS_SESSION_FLAG_SMB2)
+ return NULL;
+
sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL);
if (!sess)
return NULL;
@@ -329,30 +317,20 @@ static struct ksmbd_session *__session_create(int protocol)
set_session_flag(sess, protocol);
xa_init(&sess->tree_conns);
- INIT_LIST_HEAD(&sess->ksmbd_chann_list);
- INIT_LIST_HEAD(&sess->rpc_handle_list);
+ xa_init(&sess->ksmbd_chann_list);
+ xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
- rwlock_init(&sess->chann_lock);
-
- switch (protocol) {
- case CIFDS_SESSION_FLAG_SMB2:
- ret = __init_smb2_session(sess);
- break;
- default:
- ret = -EINVAL;
- break;
- }
+ ret = __init_smb2_session(sess);
if (ret)
goto error;
ida_init(&sess->tree_conn_ida);
- if (protocol == CIFDS_SESSION_FLAG_SMB2) {
- down_write(&sessions_table_lock);
- hash_add(sessions_table, &sess->hlist, sess->id);
- up_write(&sessions_table_lock);
- }
+ down_write(&sessions_table_lock);
+ hash_add(sessions_table, &sess->hlist, sess->id);
+ up_write(&sessions_table_lock);
+
return sess;
error:
diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h
index 8934b8ee275b..b6a9e7a6aae4 100644
--- a/fs/ksmbd/mgmt/user_session.h
+++ b/fs/ksmbd/mgmt/user_session.h
@@ -21,7 +21,6 @@ struct ksmbd_file_table;
struct channel {
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
struct ksmbd_conn *conn;
- struct list_head chann_list;
};
struct preauth_session {
@@ -50,11 +49,10 @@ struct ksmbd_session {
char sess_key[CIFS_KEY_SIZE];
struct hlist_node hlist;
- rwlock_t chann_lock;
- struct list_head ksmbd_chann_list;
+ struct xarray ksmbd_chann_list;
struct xarray tree_conns;
struct ida tree_conn_ida;
- struct list_head rpc_handle_list;
+ struct xarray rpc_handle_list;
__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE];
diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
index 6e25ace36568..fbdde426dd01 100644
--- a/fs/ksmbd/smb2misc.c
+++ b/fs/ksmbd/smb2misc.c
@@ -149,15 +149,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
break;
case SMB2_LOCK:
{
- int lock_count;
+ unsigned short lock_count;
- /*
- * smb2_lock request size is 48 included single
- * smb2_lock_element structure size.
- */
- lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount) - 1;
+ lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount);
if (lock_count > 0) {
- *off = __SMB2_HEADER_STRUCTURE_SIZE + 48;
+ *off = offsetof(struct smb2_lock_req, locks);
*len = sizeof(struct smb2_lock_element) * lock_count;
}
break;
@@ -412,20 +408,19 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
goto validate_credit;
/*
- * windows client also pad up to 8 bytes when compounding.
- * If pad is longer than eight bytes, log the server behavior
- * (once), since may indicate a problem but allow it and
- * continue since the frame is parseable.
+ * SMB2 NEGOTIATE request will be validated when message
+ * handling proceeds.
*/
- if (clc_len < len) {
- ksmbd_debug(SMB,
- "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
- len, clc_len, command,
- le64_to_cpu(hdr->MessageId));
+ if (command == SMB2_NEGOTIATE_HE)
+ goto validate_credit;
+
+ /*
+ * Allow a message that padded to 8byte boundary.
+ */
+ if (clc_len < len && (len - clc_len) < 8)
goto validate_credit;
- }
- ksmbd_debug(SMB,
+ pr_err_ratelimited(
"cli req too short, len %d not %d. cmd:%d mid:%llu\n",
len, clc_len, command,
le64_to_cpu(hdr->MessageId));
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index e401302478c3..aed7704a0672 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -26,7 +26,7 @@ static struct smb_version_values smb21_server_values = {
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -52,7 +52,7 @@ static struct smb_version_values smb30_server_values = {
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -79,7 +79,7 @@ static struct smb_version_values smb302_server_values = {
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
@@ -106,7 +106,7 @@ static struct smb_version_values smb311_server_values = {
.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
.header_size = sizeof(struct smb2_hdr),
.max_header_size = MAX_SMB2_HDR_SIZE,
- .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .read_rsp_size = sizeof(struct smb2_read_rsp),
.lock_cmd = SMB2_LOCK,
.cap_unix = 0,
.cap_nt_find = SMB2_NT_FIND,
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 4ef6e1e59a40..0685c1c77b9f 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -75,14 +75,7 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id)
struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn)
{
- struct channel *chann;
-
- list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) {
- if (chann->conn == conn)
- return chann;
- }
-
- return NULL;
+ return xa_load(&sess->ksmbd_chann_list, (long)conn);
}
/**
@@ -281,8 +274,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
le16_to_cpu(rsp->SecurityBufferOffset));
inc_rfc1001_len(work->response_buf,
sizeof(struct smb2_negotiate_rsp) -
- sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
- AUTH_GSS_LENGTH);
+ sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH);
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY)
rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
@@ -506,7 +498,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work)
rsp_hdr->SessionId = rcv_hdr->SessionId;
memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16);
- work->syncronous = true;
+ work->synchronous = true;
if (work->async_id) {
ksmbd_release_id(&conn->async_ida, work->async_id);
work->async_id = 0;
@@ -596,6 +588,7 @@ static void destroy_previous_session(struct ksmbd_conn *conn,
struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
struct ksmbd_user *prev_user;
struct channel *chann;
+ long index;
if (!prev_sess)
return;
@@ -609,10 +602,8 @@ static void destroy_previous_session(struct ksmbd_conn *conn,
return;
prev_sess->state = SMB2_SESSION_EXPIRED;
- write_lock(&prev_sess->chann_lock);
- list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list)
+ xa_for_each(&prev_sess->ksmbd_chann_list, index, chann)
chann->conn->status = KSMBD_SESS_EXITING;
- write_unlock(&prev_sess->chann_lock);
}
/**
@@ -653,7 +644,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
pr_err("Failed to alloc async message id\n");
return id;
}
- work->syncronous = false;
+ work->synchronous = false;
work->async_id = id;
rsp_hdr->Id.AsyncId = cpu_to_le64(id);
@@ -1213,8 +1204,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) +
le16_to_cpu(rsp->SecurityBufferOffset));
inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) -
- sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) +
- AUTH_GSS_LENGTH);
+ sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH);
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
conn->use_spnego = true;
@@ -1520,19 +1510,14 @@ static int ntlm_authenticate(struct ksmbd_work *work)
binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
- read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
- read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
return -ENOMEM;
chann->conn = conn;
- INIT_LIST_HEAD(&chann->chann_list);
- write_lock(&sess->chann_lock);
- list_add(&chann->chann_list, &sess->ksmbd_chann_list);
- write_unlock(&sess->chann_lock);
+ xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL);
}
}
@@ -1607,19 +1592,14 @@ static int krb5_authenticate(struct ksmbd_work *work)
}
if (conn->dialect >= SMB30_PROT_ID) {
- read_lock(&sess->chann_lock);
chann = lookup_chann_list(sess, conn);
- read_unlock(&sess->chann_lock);
if (!chann) {
chann = kmalloc(sizeof(struct channel), GFP_KERNEL);
if (!chann)
return -ENOMEM;
chann->conn = conn;
- INIT_LIST_HEAD(&chann->chann_list);
- write_lock(&sess->chann_lock);
- list_add(&chann->chann_list, &sess->ksmbd_chann_list);
- write_unlock(&sess->chann_lock);
+ xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL);
}
}
@@ -6645,7 +6625,7 @@ int smb2_cancel(struct ksmbd_work *work)
struct ksmbd_conn *conn = work->conn;
struct smb2_hdr *hdr = smb2_get_msg(work->request_buf);
struct smb2_hdr *chdr;
- struct ksmbd_work *cancel_work = NULL, *iter;
+ struct ksmbd_work *iter;
struct list_head *command_list;
ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n",
@@ -6667,7 +6647,9 @@ int smb2_cancel(struct ksmbd_work *work)
"smb2 with AsyncId %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->Id.AsyncId),
le16_to_cpu(chdr->Command));
- cancel_work = iter;
+ iter->state = KSMBD_WORK_CANCELLED;
+ if (iter->cancel_fn)
+ iter->cancel_fn(iter->cancel_argv);
break;
}
spin_unlock(&conn->request_lock);
@@ -6686,18 +6668,12 @@ int smb2_cancel(struct ksmbd_work *work)
"smb2 with mid %llu cancelled command = 0x%x\n",
le64_to_cpu(hdr->MessageId),
le16_to_cpu(chdr->Command));
- cancel_work = iter;
+ iter->state = KSMBD_WORK_CANCELLED;
break;
}
spin_unlock(&conn->request_lock);
}
- if (cancel_work) {
- cancel_work->state = KSMBD_WORK_CANCELLED;
- if (cancel_work->cancel_fn)
- cancel_work->cancel_fn(cancel_work->cancel_argv);
- }
-
/* For SMB2_CANCEL command itself send no response*/
work->send_no_response = 1;
return 0;
@@ -7062,6 +7038,14 @@ skip:
ksmbd_vfs_posix_lock_wait(flock);
+ spin_lock(&work->conn->request_lock);
+ spin_lock(&fp->f_lock);
+ list_del(&work->fp_entry);
+ work->cancel_fn = NULL;
+ kfree(argv);
+ spin_unlock(&fp->f_lock);
+ spin_unlock(&work->conn->request_lock);
+
if (work->state != KSMBD_WORK_ACTIVE) {
list_del(&smb_lock->llist);
spin_lock(&work->conn->llist_lock);
@@ -7070,9 +7054,6 @@ skip:
locks_free_lock(flock);
if (work->state == KSMBD_WORK_CANCELLED) {
- spin_lock(&fp->f_lock);
- list_del(&work->fp_entry);
- spin_unlock(&fp->f_lock);
rsp->hdr.Status =
STATUS_CANCELLED;
kfree(smb_lock);
@@ -7094,9 +7075,6 @@ skip:
list_del(&smb_lock->clist);
spin_unlock(&work->conn->llist_lock);
- spin_lock(&fp->f_lock);
- list_del(&work->fp_entry);
- spin_unlock(&fp->f_lock);
goto retry;
} else if (!rc) {
spin_lock(&work->conn->llist_lock);
@@ -8410,14 +8388,11 @@ int smb3_check_sign_req(struct ksmbd_work *work)
if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
- read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, conn);
if (!chann) {
- read_unlock(&work->sess->chann_lock);
return 0;
}
signing_key = chann->smb3signingkey;
- read_unlock(&work->sess->chann_lock);
}
if (!signing_key) {
@@ -8477,14 +8452,11 @@ void smb3_set_sign_rsp(struct ksmbd_work *work)
le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
signing_key = work->sess->smb3signingkey;
} else {
- read_lock(&work->sess->chann_lock);
chann = lookup_chann_list(work->sess, work->conn);
if (!chann) {
- read_unlock(&work->sess->chann_lock);
return;
}
signing_key = chann->smb3signingkey;
- read_unlock(&work->sess->chann_lock);
}
if (!signing_key)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index aa1300b7bfc2..5ea9229dad2c 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -952,9 +952,9 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap,
* ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value
* @idmap: idmap of the relevant mount
* @dentry: dentry to set XATTR at
- * @name: xattr name for setxattr
- * @value: xattr value to set
- * @size: size of xattr value
+ * @attr_name: xattr name for setxattr
+ * @attr_value: xattr value to set
+ * @attr_size: size of xattr value
* @flags: destination buffer length
*
* Return: 0 on success, otherwise error
diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c
index 1d8126443a7f..054a7d2e0f48 100644
--- a/fs/ksmbd/vfs_cache.c
+++ b/fs/ksmbd/vfs_cache.c
@@ -365,12 +365,11 @@ static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp)
static void set_close_state_blocked_works(struct ksmbd_file *fp)
{
- struct ksmbd_work *cancel_work, *ctmp;
+ struct ksmbd_work *cancel_work;
spin_lock(&fp->f_lock);
- list_for_each_entry_safe(cancel_work, ctmp, &fp->blocked_works,
+ list_for_each_entry(cancel_work, &fp->blocked_works,
fp_entry) {
- list_del(&cancel_work->fp_entry);
cancel_work->state = KSMBD_WORK_CLOSED;
cancel_work->cancel_fn(cancel_work->cancel_argv);
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 914ea1c3537d..9a47303b2cba 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -685,17 +685,16 @@ module_exit(exit_nlm);
/**
* nlmsvc_dispatch - Process an NLM Request
* @rqstp: incoming request
- * @statp: pointer to location of accept_stat field in RPC Reply buffer
*
* Return values:
* %0: Processing complete; do not send a Reply
* %1: Processing complete; send Reply in rqstp->rq_res
*/
-static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+static int nlmsvc_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *procp = rqstp->rq_procinfo;
+ __be32 *statp = rqstp->rq_accept_statp;
- svcxdr_init_decode(rqstp);
if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
@@ -705,7 +704,6 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
if (*statp != rpc_success)
return 1;
- svcxdr_init_encode(rqstp);
if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
@@ -723,7 +721,7 @@ out_encode_err:
/*
* Define NLM program and procedures
*/
-static unsigned int nlmsvc_version1_count[17];
+static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]);
static const struct svc_version nlmsvc_version1 = {
.vs_vers = 1,
.vs_nproc = 17,
@@ -732,26 +730,31 @@ static const struct svc_version nlmsvc_version1 = {
.vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
-static unsigned int nlmsvc_version3_count[24];
+
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]);
static const struct svc_version nlmsvc_version3 = {
.vs_vers = 3,
- .vs_nproc = 24,
+ .vs_nproc = ARRAY_SIZE(nlmsvc_procedures),
.vs_proc = nlmsvc_procedures,
.vs_count = nlmsvc_version3_count,
.vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
+
#ifdef CONFIG_LOCKD_V4
-static unsigned int nlmsvc_version4_count[24];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]);
static const struct svc_version nlmsvc_version4 = {
.vs_vers = 4,
- .vs_nproc = 24,
+ .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4),
.vs_proc = nlmsvc_procedures4,
.vs_count = nlmsvc_version4_count,
.vs_dispatch = nlmsvc_dispatch,
.vs_xdrsize = NLMSVC_XDRSIZE,
};
#endif
+
static const struct svc_version *nlmsvc_version[] = {
[1] = &nlmsvc_version1,
[3] = &nlmsvc_version3,
diff --git a/fs/namei.c b/fs/namei.c
index 5855dc6edbd5..edfedfbccaef 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1459,11 +1459,11 @@ EXPORT_SYMBOL(follow_down_one);
* point, the filesystem owning that dentry may be queried as to whether the
* caller is permitted to proceed or not.
*/
-int follow_down(struct path *path)
+int follow_down(struct path *path, unsigned int flags)
{
struct vfsmount *mnt = path->mnt;
bool jumped;
- int ret = traverse_mounts(path, &jumped, NULL, 0);
+ int ret = traverse_mounts(path, &jumped, NULL, flags);
if (path->mnt != mnt)
mntput(mnt);
@@ -2865,7 +2865,7 @@ int path_pts(struct path *path)
path->dentry = child;
dput(parent);
- follow_down(path);
+ follow_down(path, 0);
return 0;
}
#endif
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index f684c0cd1ec5..386d6fb92793 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -3,6 +3,7 @@
netfs-y := \
buffered_read.o \
io.o \
+ iterator.o \
main.o \
objects.o
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
new file mode 100644
index 000000000000..f00d43b8ac0a
--- /dev/null
+++ b/fs/netfs/iterator.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Iterator helpers.
+ *
+ * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/scatterlist.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_extract_user_iter - Extract the pages from a user iterator into a bvec
+ * @orig: The original iterator
+ * @orig_len: The amount of iterator to copy
+ * @new: The iterator to be set up
+ * @extraction_flags: Flags to qualify the request
+ *
+ * Extract the page fragments from the given amount of the source iterator and
+ * build up a second iterator that refers to all of those bits. This allows
+ * the original iterator to disposed of.
+ *
+ * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be
+ * allowed on the pages extracted.
+ *
+ * On success, the number of elements in the bvec is returned, the original
+ * iterator will have been advanced by the amount extracted.
+ *
+ * The iov_iter_extract_mode() function should be used to query how cleanup
+ * should be performed.
+ */
+ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
+ struct iov_iter *new,
+ iov_iter_extraction_t extraction_flags)
+{
+ struct bio_vec *bv = NULL;
+ struct page **pages;
+ unsigned int cur_npages;
+ unsigned int max_pages;
+ unsigned int npages = 0;
+ unsigned int i;
+ ssize_t ret;
+ size_t count = orig_len, offset, len;
+ size_t bv_size, pg_size;
+
+ if (WARN_ON_ONCE(!iter_is_ubuf(orig) && !iter_is_iovec(orig)))
+ return -EIO;
+
+ max_pages = iov_iter_npages(orig, INT_MAX);
+ bv_size = array_size(max_pages, sizeof(*bv));
+ bv = kvmalloc(bv_size, GFP_KERNEL);
+ if (!bv)
+ return -ENOMEM;
+
+ /* Put the page list at the end of the bvec list storage. bvec
+ * elements are larger than page pointers, so as long as we work
+ * 0->last, we should be fine.
+ */
+ pg_size = array_size(max_pages, sizeof(*pages));
+ pages = (void *)bv + bv_size - pg_size;
+
+ while (count && npages < max_pages) {
+ ret = iov_iter_extract_pages(orig, &pages, count,
+ max_pages - npages, extraction_flags,
+ &offset);
+ if (ret < 0) {
+ pr_err("Couldn't get user pages (rc=%zd)\n", ret);
+ break;
+ }
+
+ if (ret > count) {
+ pr_err("get_pages rc=%zd more than %zu\n", ret, count);
+ break;
+ }
+
+ count -= ret;
+ ret += offset;
+ cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE);
+
+ if (npages + cur_npages > max_pages) {
+ pr_err("Out of bvec array capacity (%u vs %u)\n",
+ npages + cur_npages, max_pages);
+ break;
+ }
+
+ for (i = 0; i < cur_npages; i++) {
+ len = ret > PAGE_SIZE ? PAGE_SIZE : ret;
+ bvec_set_page(bv + npages + i, *pages++, len - offset, offset);
+ ret -= len;
+ offset = 0;
+ }
+
+ npages += cur_npages;
+ }
+
+ iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count);
+ return npages;
+}
+EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class
+ * iterators, and add them to the scatterlist.
+ */
+static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter,
+ ssize_t maxsize,
+ struct sg_table *sgtable,
+ unsigned int sg_max,
+ iov_iter_extraction_t extraction_flags)
+{
+ struct scatterlist *sg = sgtable->sgl + sgtable->nents;
+ struct page **pages;
+ unsigned int npages;
+ ssize_t ret = 0, res;
+ size_t len, off;
+
+ /* We decant the page list into the tail of the scatterlist */
+ pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist));
+ pages -= sg_max;
+
+ do {
+ res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max,
+ extraction_flags, &off);
+ if (res < 0)
+ goto failed;
+
+ len = res;
+ maxsize -= len;
+ ret += len;
+ npages = DIV_ROUND_UP(off + len, PAGE_SIZE);
+ sg_max -= npages;
+
+ for (; npages < 0; npages--) {
+ struct page *page = *pages;
+ size_t seg = min_t(size_t, PAGE_SIZE - off, len);
+
+ *pages++ = NULL;
+ sg_set_page(sg, page, len, off);
+ sgtable->nents++;
+ sg++;
+ len -= seg;
+ off = 0;
+ }
+ } while (maxsize > 0 && sg_max > 0);
+
+ return ret;
+
+failed:
+ while (sgtable->nents > sgtable->orig_nents)
+ put_page(sg_page(&sgtable->sgl[--sgtable->nents]));
+ return res;
+}
+
+/*
+ * Extract up to sg_max pages from a BVEC-type iterator and add them to the
+ * scatterlist. The pages are not pinned.
+ */
+static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter,
+ ssize_t maxsize,
+ struct sg_table *sgtable,
+ unsigned int sg_max,
+ iov_iter_extraction_t extraction_flags)
+{
+ const struct bio_vec *bv = iter->bvec;
+ struct scatterlist *sg = sgtable->sgl + sgtable->nents;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ size_t off, len;
+
+ len = bv[i].bv_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ len = min_t(size_t, maxsize, len - start);
+ off = bv[i].bv_offset + start;
+
+ sg_set_page(sg, bv[i].bv_page, len, off);
+ sgtable->nents++;
+ sg++;
+ sg_max--;
+
+ ret += len;
+ maxsize -= len;
+ if (maxsize <= 0 || sg_max == 0)
+ break;
+ start = 0;
+ }
+
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
+/*
+ * Extract up to sg_max pages from a KVEC-type iterator and add them to the
+ * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or
+ * static buffers. The pages are not pinned.
+ */
+static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter,
+ ssize_t maxsize,
+ struct sg_table *sgtable,
+ unsigned int sg_max,
+ iov_iter_extraction_t extraction_flags)
+{
+ const struct kvec *kv = iter->kvec;
+ struct scatterlist *sg = sgtable->sgl + sgtable->nents;
+ unsigned long start = iter->iov_offset;
+ unsigned int i;
+ ssize_t ret = 0;
+
+ for (i = 0; i < iter->nr_segs; i++) {
+ struct page *page;
+ unsigned long kaddr;
+ size_t off, len, seg;
+
+ len = kv[i].iov_len;
+ if (start >= len) {
+ start -= len;
+ continue;
+ }
+
+ kaddr = (unsigned long)kv[i].iov_base + start;
+ off = kaddr & ~PAGE_MASK;
+ len = min_t(size_t, maxsize, len - start);
+ kaddr &= PAGE_MASK;
+
+ maxsize -= len;
+ ret += len;
+ do {
+ seg = min_t(size_t, len, PAGE_SIZE - off);
+ if (is_vmalloc_or_module_addr((void *)kaddr))
+ page = vmalloc_to_page((void *)kaddr);
+ else
+ page = virt_to_page(kaddr);
+
+ sg_set_page(sg, page, len, off);
+ sgtable->nents++;
+ sg++;
+ sg_max--;
+
+ len -= seg;
+ kaddr += PAGE_SIZE;
+ off = 0;
+ } while (len > 0 && sg_max > 0);
+
+ if (maxsize <= 0 || sg_max == 0)
+ break;
+ start = 0;
+ }
+
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
+/*
+ * Extract up to sg_max folios from an XARRAY-type iterator and add them to
+ * the scatterlist. The pages are not pinned.
+ */
+static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter,
+ ssize_t maxsize,
+ struct sg_table *sgtable,
+ unsigned int sg_max,
+ iov_iter_extraction_t extraction_flags)
+{
+ struct scatterlist *sg = sgtable->sgl + sgtable->nents;
+ struct xarray *xa = iter->xarray;
+ struct folio *folio;
+ loff_t start = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = start / PAGE_SIZE;
+ ssize_t ret = 0;
+ size_t offset, len;
+ XA_STATE(xas, xa, index);
+
+ rcu_read_lock();
+
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ offset = offset_in_folio(folio, start);
+ len = min_t(size_t, maxsize, folio_size(folio) - offset);
+
+ sg_set_page(sg, folio_page(folio, 0), len, offset);
+ sgtable->nents++;
+ sg++;
+ sg_max--;
+
+ maxsize -= len;
+ ret += len;
+ if (maxsize <= 0 || sg_max == 0)
+ break;
+ }
+
+ rcu_read_unlock();
+ if (ret > 0)
+ iov_iter_advance(iter, ret);
+ return ret;
+}
+
+/**
+ * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist
+ * @iter: The iterator to extract from
+ * @maxsize: The amount of iterator to copy
+ * @sgtable: The scatterlist table to fill in
+ * @sg_max: Maximum number of elements in @sgtable that may be filled
+ * @extraction_flags: Flags to qualify the request
+ *
+ * Extract the page fragments from the given amount of the source iterator and
+ * add them to a scatterlist that refers to all of those bits, to a maximum
+ * addition of @sg_max elements.
+ *
+ * The pages referred to by UBUF- and IOVEC-type iterators are extracted and
+ * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE-
+ * and DISCARD-type are not supported.
+ *
+ * No end mark is placed on the scatterlist; that's left to the caller.
+ *
+ * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
+ * be allowed on the pages extracted.
+ *
+ * If successul, @sgtable->nents is updated to include the number of elements
+ * added and the number of bytes added is returned. @sgtable->orig_nents is
+ * left unaltered.
+ *
+ * The iov_iter_extract_mode() function should be used to query how cleanup
+ * should be performed.
+ */
+ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize,
+ struct sg_table *sgtable, unsigned int sg_max,
+ iov_iter_extraction_t extraction_flags)
+{
+ if (maxsize == 0)
+ return 0;
+
+ switch (iov_iter_type(iter)) {
+ case ITER_UBUF:
+ case ITER_IOVEC:
+ return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max,
+ extraction_flags);
+ case ITER_BVEC:
+ return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max,
+ extraction_flags);
+ case ITER_KVEC:
+ return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max,
+ extraction_flags);
+ case ITER_XARRAY:
+ return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max,
+ extraction_flags);
+ default:
+ pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter));
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+}
+EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d0cccddb7d08..321af81c456e 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -980,14 +980,11 @@ out_invalidcred:
}
static int
-nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+nfs_callback_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *procp = rqstp->rq_procinfo;
- svcxdr_init_decode(rqstp);
- svcxdr_init_encode(rqstp);
-
- *statp = procp->pc_func(rqstp);
+ *rqstp->rq_accept_statp = procp->pc_func(rqstp);
return 1;
}
@@ -1072,7 +1069,8 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
}
};
-static unsigned int nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]);
const struct svc_version nfs4_callback_version1 = {
.vs_vers = 1,
.vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
@@ -1084,7 +1082,8 @@ const struct svc_version nfs4_callback_version1 = {
.vs_need_cong_ctrl = true,
};
-static unsigned int nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]);
const struct svc_version nfs4_callback_version4 = {
.vs_vers = 4,
.vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f8e420464b77..a41c3ee4549c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -203,14 +203,14 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
{
struct nfs_cache_array *array;
- array = kmap_atomic(page);
+ array = kmap_local_page(page);
array->change_attr = change_attr;
array->last_cookie = last_cookie;
array->size = 0;
array->page_full = 0;
array->page_is_eof = 0;
array->cookies_are_ordered = 1;
- kunmap_atomic(array);
+ kunmap_local(array);
}
/*
@@ -221,11 +221,11 @@ static void nfs_readdir_clear_array(struct page *page)
struct nfs_cache_array *array;
unsigned int i;
- array = kmap_atomic(page);
+ array = kmap_local_page(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].name);
array->size = 0;
- kunmap_atomic(array);
+ kunmap_local(array);
}
static void nfs_readdir_free_folio(struct folio *folio)
@@ -371,14 +371,14 @@ static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
u64 change_attr)
{
- struct nfs_cache_array *array = kmap_atomic(page);
+ struct nfs_cache_array *array = kmap_local_page(page);
int ret = true;
if (array->change_attr != change_attr)
ret = false;
if (nfs_readdir_array_index_cookie(array) != last_cookie)
ret = false;
- kunmap_atomic(array);
+ kunmap_local(array);
return ret;
}
@@ -418,9 +418,9 @@ static u64 nfs_readdir_page_last_cookie(struct page *page)
struct nfs_cache_array *array;
u64 ret;
- array = kmap_atomic(page);
+ array = kmap_local_page(page);
ret = array->last_cookie;
- kunmap_atomic(array);
+ kunmap_local(array);
return ret;
}
@@ -429,9 +429,9 @@ static bool nfs_readdir_page_needs_filling(struct page *page)
struct nfs_cache_array *array;
bool ret;
- array = kmap_atomic(page);
+ array = kmap_local_page(page);
ret = !nfs_readdir_array_is_full(array);
- kunmap_atomic(array);
+ kunmap_local(array);
return ret;
}
@@ -439,9 +439,9 @@ static void nfs_readdir_page_set_eof(struct page *page)
{
struct nfs_cache_array *array;
- array = kmap_atomic(page);
+ array = kmap_local_page(page);
nfs_readdir_array_set_eof(array);
- kunmap_atomic(array);
+ kunmap_local(array);
}
static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
@@ -568,14 +568,14 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
struct nfs_cache_array *array;
int status;
- array = kmap_atomic(desc->page);
+ array = kmap_local_page(desc->page);
if (desc->dir_cookie == 0)
status = nfs_readdir_search_for_pos(array, desc);
else
status = nfs_readdir_search_for_cookie(array, desc);
- kunmap_atomic(array);
+ kunmap_local(array);
return status;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1707f46b1335..9a18c5a69ace 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -343,14 +343,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
/* XXX do we need to do the eof zeroing found in async_filler? */
- req = nfs_create_request(dreq->ctx, pagevec[i],
- pgbase, req_len);
+ req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
+ pgbase, pos, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
break;
}
- req->wb_index = pos >> PAGE_SHIFT;
- req->wb_offset = pos & ~PAGE_MASK;
if (!nfs_pageio_add_request(&desc, req)) {
result = desc.pg_error;
nfs_release_request(req);
@@ -802,8 +800,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
- req = nfs_create_request(dreq->ctx, pagevec[i],
- pgbase, req_len);
+ req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
+ pgbase, pos, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
break;
@@ -816,8 +814,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_lock_request(req);
- req->wb_index = pos >> PAGE_SHIFT;
- req->wb_offset = pos & ~PAGE_MASK;
if (!nfs_pageio_add_request(&desc, req)) {
result = desc.pg_error;
nfs_unlock_and_release_request(req);
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 1a9d5aa51dfb..d6a6d1ebb8fd 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -42,7 +42,7 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent)
dprintk("%s: max fh len %d inode %p parent %p",
__func__, *max_len, inode, parent);
- if (*max_len < len || IS_AUTOMOUNT(inode)) {
+ if (*max_len < len) {
dprintk("%s: fh len %d too small, required %d\n",
__func__, *max_len, len);
*max_len = len;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index b0f3c9339e70..893625eacab9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -277,27 +277,28 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync);
* and that the new data won't completely replace the old data in
* that range of the file.
*/
-static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len)
+static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos,
+ unsigned int len)
{
- unsigned int pglen = nfs_page_length(page);
- unsigned int offset = pos & (PAGE_SIZE - 1);
+ unsigned int pglen = nfs_folio_length(folio);
+ unsigned int offset = offset_in_folio(folio, pos);
unsigned int end = offset + len;
return !pglen || (end >= pglen && !offset);
}
-static bool nfs_want_read_modify_write(struct file *file, struct page *page,
- loff_t pos, unsigned int len)
+static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
+ loff_t pos, unsigned int len)
{
/*
* Up-to-date pages, those with ongoing or full-page write
* don't need read/modify/write
*/
- if (PageUptodate(page) || PagePrivate(page) ||
- nfs_full_page_write(page, pos, len))
+ if (folio_test_uptodate(folio) || folio_test_private(folio) ||
+ nfs_folio_is_full_write(folio, pos, len))
return false;
- if (pnfs_ld_read_whole_page(file->f_mapping->host))
+ if (pnfs_ld_read_whole_page(file_inode(file)))
return true;
/* Open for reading too? */
if (file->f_mode & FMODE_READ)
@@ -305,6 +306,15 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page,
return false;
}
+static struct folio *
+nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index)
+{
+ unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
/*
* This does the "real" work of the write. We must allocate and lock the
* page to be sent back to the generic routine, which then copies the
@@ -314,32 +324,31 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page,
* increment the page use counts until he is done with the page.
*/
static int nfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, struct page **pagep,
+ void **fsdata)
{
- int ret;
- pgoff_t index = pos >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
int once_thru = 0;
+ int ret;
dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
start:
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
+ folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT);
+ if (!folio)
return -ENOMEM;
- *pagep = page;
+ *pagep = &folio->page;
- ret = nfs_flush_incompatible(file, page);
+ ret = nfs_flush_incompatible(file, folio);
if (ret) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
} else if (!once_thru &&
- nfs_want_read_modify_write(file, page, pos, len)) {
+ nfs_want_read_modify_write(file, folio, pos, len)) {
once_thru = 1;
- ret = nfs_read_folio(file, page_folio(page));
- put_page(page);
+ ret = nfs_read_folio(file, folio);
+ folio_put(folio);
if (!ret)
goto start;
}
@@ -347,11 +356,12 @@ start:
}
static int nfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- unsigned offset = pos & (PAGE_SIZE - 1);
struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct folio *folio = page_folio(page);
+ unsigned offset = offset_in_folio(folio, pos);
int status;
dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
@@ -361,26 +371,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* Zero any uninitialised parts of the page, and then mark the page
* as up to date if it turns out that we're extending the file.
*/
- if (!PageUptodate(page)) {
- unsigned pglen = nfs_page_length(page);
+ if (!folio_test_uptodate(folio)) {
+ size_t fsize = folio_size(folio);
+ unsigned pglen = nfs_folio_length(folio);
unsigned end = offset + copied;
if (pglen == 0) {
- zero_user_segments(page, 0, offset,
- end, PAGE_SIZE);
- SetPageUptodate(page);
+ folio_zero_segments(folio, 0, offset, end, fsize);
+ folio_mark_uptodate(folio);
} else if (end >= pglen) {
- zero_user_segment(page, end, PAGE_SIZE);
+ folio_zero_segment(folio, end, fsize);
if (offset == 0)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else
- zero_user_segment(page, pglen, PAGE_SIZE);
+ folio_zero_segment(folio, pglen, fsize);
}
- status = nfs_updatepage(file, page, offset, copied);
+ status = nfs_update_folio(file, folio, offset, copied);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (status < 0)
return status;
@@ -402,14 +412,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
static void nfs_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
+ struct inode *inode = folio_file_mapping(folio)->host;
dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
folio->index, offset, length);
if (offset != 0 || length < folio_size(folio))
return;
/* Cancel any unstarted writes on this page */
- nfs_wb_folio_cancel(folio->mapping->host, folio);
+ nfs_wb_folio_cancel(inode, folio);
folio_wait_fscache(folio);
+ trace_nfs_invalidate_folio(inode, folio);
}
/*
@@ -423,8 +435,13 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio);
/* If the private flag is set, then the folio is not freeable */
- if (folio_test_private(folio))
- return false;
+ if (folio_test_private(folio)) {
+ if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
+ current_is_kswapd())
+ return false;
+ if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0)
+ return false;
+ }
return nfs_fscache_release_folio(folio, gfp);
}
@@ -465,12 +482,15 @@ static void nfs_check_dirty_writeback(struct folio *folio,
static int nfs_launder_folio(struct folio *folio)
{
struct inode *inode = folio->mapping->host;
+ int ret;
dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
inode->i_ino, folio_pos(folio));
folio_wait_fscache(folio);
- return nfs_wb_page(inode, &folio->page);
+ ret = nfs_wb_folio(inode, folio);
+ trace_nfs_launder_folio_done(inode, folio, ret);
+ return ret;
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -547,22 +567,22 @@ const struct address_space_operations nfs_file_aops = {
*/
static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
struct file *filp = vmf->vma->vm_file;
struct inode *inode = file_inode(filp);
unsigned pagelen;
vm_fault_t ret = VM_FAULT_NOPAGE;
struct address_space *mapping;
+ struct folio *folio = page_folio(vmf->page);
dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
- filp, filp->f_mapping->host->i_ino,
- (long long)page_offset(page));
+ filp, filp->f_mapping->host->i_ino,
+ (long long)folio_file_pos(folio));
sb_start_pagefault(inode->i_sb);
/* make sure the cache has finished storing the page */
- if (PageFsCache(page) &&
- wait_on_page_fscache_killable(vmf->page) < 0) {
+ if (folio_test_fscache(folio) &&
+ folio_wait_fscache_killable(folio) < 0) {
ret = VM_FAULT_RETRY;
goto out;
}
@@ -571,25 +591,25 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
nfs_wait_bit_killable,
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
- lock_page(page);
- mapping = page_file_mapping(page);
+ folio_lock(folio);
+ mapping = folio_file_mapping(folio);
if (mapping != inode->i_mapping)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- pagelen = nfs_page_length(page);
+ pagelen = nfs_folio_length(folio);
if (pagelen == 0)
goto out_unlock;
ret = VM_FAULT_LOCKED;
- if (nfs_flush_incompatible(filp, page) == 0 &&
- nfs_updatepage(filp, page, 0, pagelen) == 0)
+ if (nfs_flush_incompatible(filp, folio) == 0 &&
+ nfs_update_folio(filp, folio, 0, pagelen) == 0)
goto out;
ret = VM_FAULT_SIGBUS;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
out:
sb_end_pagefault(inode->i_sb);
return ret;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 4974cd18ca46..ce8f8934bca5 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -862,6 +862,8 @@ fl_pnfs_update_layout(struct inode *ino,
status = filelayout_check_deviceid(lo, fl, gfp_flags);
if (status) {
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ pnfs_set_lo_fail(lseg);
pnfs_put_lseg(lseg);
lseg = NULL;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 41468c21291d..2a65fe2a63ab 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -760,17 +760,18 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
* Record the page as unstable (an extra writeback period) and mark its
* inode as dirty.
*/
-static inline
-void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
+static inline void nfs_folio_mark_unstable(struct folio *folio,
+ struct nfs_commit_info *cinfo)
{
- if (!cinfo->dreq) {
- struct inode *inode = page_file_mapping(page)->host;
+ if (folio && !cinfo->dreq) {
+ struct inode *inode = folio_file_mapping(folio)->host;
+ long nr = folio_nr_pages(folio);
/* This page is really still in write-back - just that the
* writeback is happening on the server now.
*/
- inc_node_page_state(page, NR_WRITEBACK);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
+ node_stat_mod_folio(folio, NR_WRITEBACK, nr);
+ wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
}
@@ -795,6 +796,24 @@ unsigned int nfs_page_length(struct page *page)
}
/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline size_t nfs_folio_length(struct folio *folio)
+{
+ loff_t i_size = i_size_read(folio_file_mapping(folio)->host);
+
+ if (i_size > 0) {
+ pgoff_t index = folio_index(folio) >> folio_order(folio);
+ pgoff_t end_index = (i_size - 1) >> folio_shift(folio);
+ if (index < end_index)
+ return folio_size(folio);
+ if (index == end_index)
+ return offset_in_folio(folio, i_size - 1) + 1;
+ }
+ return 0;
+}
+
+/*
* Convert a umode to a dirent->d_type
*/
static inline
@@ -807,11 +826,10 @@ unsigned char nfs_umode_to_dtype(umode_t mode)
* Determine the number of pages in an array of length 'len' and
* with a base offset of 'base'
*/
-static inline
-unsigned int nfs_page_array_len(unsigned int base, size_t len)
+static inline unsigned int nfs_page_array_len(unsigned int base, size_t len)
{
- return ((unsigned long)len + (unsigned long)base +
- PAGE_SIZE - 1) >> PAGE_SHIFT;
+ return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
}
/*
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ecb428512fe1..93e306bf4430 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -460,7 +460,8 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
if (err >= 0)
break;
- if (err == -ENOTSUPP &&
+ if ((err == -ENOTSUPP ||
+ err == -NFS4ERR_OFFLOAD_DENIED) &&
nfs42_files_from_same_server(src, dst)) {
err = -EOPNOTSUPP;
break;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9c332019d06..22a93ae46cd7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10604,7 +10604,9 @@ static void nfs4_disable_swap(struct inode *inode)
/* The state manager thread will now exit once it is
* woken.
*/
- wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state);
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+ nfs4_schedule_state_manager(clp);
}
static const struct inode_operations nfs4_dir_inode_operations = {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 214bc56f92d2..d27919d7241d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -292,32 +292,34 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE);
TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED);
#define show_nfs4_clp_state(state) \
__print_flags(state, "|", \
- { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \
- { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \
- { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \
- { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \
- { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \
- { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \
- { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \
- { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \
- { NFS4CLNT_SERVER_SCOPE_MISMATCH, \
- "SERVER_SCOPE_MISMATCH" }, \
- { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \
- { NFS4CLNT_BIND_CONN_TO_SESSION, \
- "BIND_CONN_TO_SESSION" }, \
- { NFS4CLNT_MOVED, "MOVED" }, \
- { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \
- { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \
- { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \
- { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \
- { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \
- { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" })
+ { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \
+ { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \
+ { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \
+ { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \
+ { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \
+ { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \
+ { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \
+ { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \
+ { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \
+ { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \
+ { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \
+ { BIT(NFS4CLNT_MOVED), "MOVED" }, \
+ { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \
+ { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \
+ { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \
+ { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \
+ { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \
+ { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \
+ { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELERETURN_DELAYED" })
TRACE_EVENT(nfs4_state_mgr,
TP_PROTO(
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 642f6921852f..a778713343df 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -152,8 +152,6 @@ DEFINE_NFS_INODE_EVENT(nfs_getattr_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit);
DEFINE_NFS_INODE_EVENT(nfs_setattr_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit);
-DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter);
-DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit);
DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
@@ -933,13 +931,13 @@ TRACE_EVENT(nfs_sillyrename_unlink,
)
);
-TRACE_EVENT(nfs_aop_readpage,
+DECLARE_EVENT_CLASS(nfs_folio_event,
TP_PROTO(
const struct inode *inode,
- struct page *page
+ struct folio *folio
),
- TP_ARGS(inode, page),
+ TP_ARGS(inode, folio),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -947,6 +945,7 @@ TRACE_EVENT(nfs_aop_readpage,
__field(u64, fileid)
__field(u64, version)
__field(loff_t, offset)
+ __field(u32, count)
),
TP_fast_assign(
@@ -956,26 +955,36 @@ TRACE_EVENT(nfs_aop_readpage,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->offset = folio_file_pos(folio);
+ __entry->count = nfs_folio_length(folio);
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "offset=%lld count=%u",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->version,
- __entry->offset
+ __entry->offset, __entry->count
)
);
-TRACE_EVENT(nfs_aop_readpage_done,
+#define DEFINE_NFS_FOLIO_EVENT(name) \
+ DEFINE_EVENT(nfs_folio_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct folio *folio \
+ ), \
+ TP_ARGS(inode, folio))
+
+DECLARE_EVENT_CLASS(nfs_folio_event_done,
TP_PROTO(
const struct inode *inode,
- struct page *page,
+ struct folio *folio,
int ret
),
- TP_ARGS(inode, page, ret),
+ TP_ARGS(inode, folio, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -984,6 +993,7 @@ TRACE_EVENT(nfs_aop_readpage_done,
__field(u64, fileid)
__field(u64, version)
__field(loff_t, offset)
+ __field(u32, count)
),
TP_fast_assign(
@@ -993,19 +1003,39 @@ TRACE_EVENT(nfs_aop_readpage_done,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
- __entry->offset = page_index(page) << PAGE_SHIFT;
+ __entry->offset = folio_file_pos(folio);
+ __entry->count = nfs_folio_length(folio);
__entry->ret = ret;
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld ret=%d",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+ "offset=%lld count=%u ret=%d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->version,
- __entry->offset, __entry->ret
+ __entry->offset, __entry->count, __entry->ret
)
);
+#define DEFINE_NFS_FOLIO_EVENT_DONE(name) \
+ DEFINE_EVENT(nfs_folio_event_done, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ struct folio *folio, \
+ int ret \
+ ), \
+ TP_ARGS(inode, folio, ret))
+
+DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done);
+
+DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio);
+DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done);
+
TRACE_EVENT(nfs_aop_readahead,
TP_PROTO(
const struct inode *inode,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 779bfc37233c..64fa8de199de 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -32,6 +32,42 @@
static struct kmem_cache *nfs_page_cachep;
static const struct rpc_call_ops nfs_pgio_common_ops;
+struct nfs_page_iter_page {
+ const struct nfs_page *req;
+ size_t count;
+};
+
+static void nfs_page_iter_page_init(struct nfs_page_iter_page *i,
+ const struct nfs_page *req)
+{
+ i->req = req;
+ i->count = 0;
+}
+
+static void nfs_page_iter_page_advance(struct nfs_page_iter_page *i, size_t sz)
+{
+ const struct nfs_page *req = i->req;
+ size_t tmp = i->count + sz;
+
+ i->count = (tmp < req->wb_bytes) ? tmp : req->wb_bytes;
+}
+
+static struct page *nfs_page_iter_page_get(struct nfs_page_iter_page *i)
+{
+ const struct nfs_page *req = i->req;
+ struct page *page;
+
+ if (i->count != req->wb_bytes) {
+ size_t base = i->count + req->wb_pgbase;
+ size_t len = PAGE_SIZE - offset_in_page(base);
+
+ page = nfs_page_to_page(req, base);
+ nfs_page_iter_page_advance(i, len);
+ return page;
+ }
+ return NULL;
+}
+
static struct nfs_pgio_mirror *
nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx)
{
@@ -391,7 +427,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
* has extra ref from the write/commit path to handle handoff
* between write and commit lists. */
if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) {
- inode = page_file_mapping(req->wb_page)->host;
+ inode = nfs_page_to_inode(req);
set_bit(PG_INODE_REF, &req->wb_flags);
kref_get(&req->wb_kref);
atomic_long_inc(&NFS_I(inode)->nrequests);
@@ -431,10 +467,9 @@ out:
nfs_release_request(head);
}
-static struct nfs_page *
-__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page,
- unsigned int pgbase, unsigned int offset,
- unsigned int count)
+static struct nfs_page *nfs_page_create(struct nfs_lock_context *l_ctx,
+ unsigned int pgbase, pgoff_t index,
+ unsigned int offset, unsigned int count)
{
struct nfs_page *req;
struct nfs_open_context *ctx = l_ctx->open_context;
@@ -453,42 +488,90 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page,
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
- req->wb_page = page;
- if (page) {
- req->wb_index = page_index(page);
- get_page(page);
- }
- req->wb_offset = offset;
- req->wb_pgbase = pgbase;
- req->wb_bytes = count;
+ req->wb_pgbase = pgbase;
+ req->wb_index = index;
+ req->wb_offset = offset;
+ req->wb_bytes = count;
kref_init(&req->wb_kref);
req->wb_nio = 0;
return req;
}
+static void nfs_page_assign_folio(struct nfs_page *req, struct folio *folio)
+{
+ if (folio != NULL) {
+ req->wb_folio = folio;
+ folio_get(folio);
+ set_bit(PG_FOLIO, &req->wb_flags);
+ }
+}
+
+static void nfs_page_assign_page(struct nfs_page *req, struct page *page)
+{
+ if (page != NULL) {
+ req->wb_page = page;
+ get_page(page);
+ }
+}
+
/**
- * nfs_create_request - Create an NFS read/write request.
+ * nfs_page_create_from_page - Create an NFS read/write request.
* @ctx: open context to use
* @page: page to write
- * @offset: starting offset within the page for the write
+ * @pgbase: starting offset within the page for the write
+ * @offset: file offset for the write
* @count: number of bytes to read/write
*
* The page must be locked by the caller. This makes sure we never
* create two different requests for the same page.
* User should ensure it is safe to sleep in this function.
*/
-struct nfs_page *
-nfs_create_request(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
+struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx,
+ struct page *page,
+ unsigned int pgbase, loff_t offset,
+ unsigned int count)
+{
+ struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx);
+ struct nfs_page *ret;
+
+ if (IS_ERR(l_ctx))
+ return ERR_CAST(l_ctx);
+ ret = nfs_page_create(l_ctx, pgbase, offset >> PAGE_SHIFT,
+ offset_in_page(offset), count);
+ if (!IS_ERR(ret)) {
+ nfs_page_assign_page(ret, page);
+ nfs_page_group_init(ret, NULL);
+ }
+ nfs_put_lock_context(l_ctx);
+ return ret;
+}
+
+/**
+ * nfs_page_create_from_folio - Create an NFS read/write request.
+ * @ctx: open context to use
+ * @folio: folio to write
+ * @offset: starting offset within the folio for the write
+ * @count: number of bytes to read/write
+ *
+ * The page must be locked by the caller. This makes sure we never
+ * create two different requests for the same page.
+ * User should ensure it is safe to sleep in this function.
+ */
+struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx,
+ struct folio *folio,
+ unsigned int offset,
+ unsigned int count)
{
struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx);
struct nfs_page *ret;
if (IS_ERR(l_ctx))
return ERR_CAST(l_ctx);
- ret = __nfs_create_request(l_ctx, page, offset, offset, count);
- if (!IS_ERR(ret))
+ ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count);
+ if (!IS_ERR(ret)) {
+ nfs_page_assign_folio(ret, folio);
nfs_page_group_init(ret, NULL);
+ }
nfs_put_lock_context(l_ctx);
return ret;
}
@@ -501,10 +584,16 @@ nfs_create_subreq(struct nfs_page *req,
{
struct nfs_page *last;
struct nfs_page *ret;
+ struct folio *folio = nfs_page_to_folio(req);
+ struct page *page = nfs_page_to_page(req, pgbase);
- ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
- pgbase, offset, count);
+ ret = nfs_page_create(req->wb_lock_context, pgbase, req->wb_index,
+ offset, count);
if (!IS_ERR(ret)) {
+ if (folio)
+ nfs_page_assign_folio(ret, folio);
+ else
+ nfs_page_assign_page(ret, page);
/* find the last request */
for (last = req->wb_head;
last->wb_this_page != req->wb_head;
@@ -512,7 +601,6 @@ nfs_create_subreq(struct nfs_page *req,
;
nfs_lock_request(ret);
- ret->wb_index = req->wb_index;
nfs_page_group_init(ret, last);
ret->wb_nio = req->wb_nio;
}
@@ -551,11 +639,16 @@ void nfs_unlock_and_release_request(struct nfs_page *req)
*/
static void nfs_clear_request(struct nfs_page *req)
{
+ struct folio *folio = nfs_page_to_folio(req);
struct page *page = req->wb_page;
struct nfs_lock_context *l_ctx = req->wb_lock_context;
struct nfs_open_context *ctx;
- if (page != NULL) {
+ if (folio != NULL) {
+ folio_put(folio);
+ req->wb_folio = NULL;
+ clear_bit(PG_FOLIO, &req->wb_flags);
+ } else if (page != NULL) {
put_page(page);
req->wb_page = NULL;
}
@@ -693,13 +786,14 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
/**
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
* @hdr: The pageio hdr
+ * @pgbase: base
* @count: Number of bytes to read
* @how: How to commit data (writes only)
* @cinfo: Commit information for the call (writes only)
*/
-static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
- unsigned int count,
- int how, struct nfs_commit_info *cinfo)
+static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int pgbase,
+ unsigned int count, int how,
+ struct nfs_commit_info *cinfo)
{
struct nfs_page *req = hdr->req;
@@ -710,7 +804,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
hdr->args.offset = req_offset(req);
/* pnfs_set_layoutcommit needs this */
hdr->mds_offset = hdr->args.offset;
- hdr->args.pgbase = req->wb_pgbase;
+ hdr->args.pgbase = pgbase;
hdr->args.pages = hdr->page_array.pagevec;
hdr->args.count = count;
hdr->args.context = get_nfs_open_context(nfs_req_openctx(req));
@@ -896,9 +990,10 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_commit_info cinfo;
struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
+ unsigned int pg_base = offset_in_page(mirror->pg_base);
gfp_t gfp_flags = nfs_io_gfp_mask();
- pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
+ pagecount = nfs_page_array_len(pg_base, mirror->pg_count);
pg_array->npages = pagecount;
if (pagecount <= ARRAY_SIZE(pg_array->page_array))
@@ -918,16 +1013,26 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
last_page = NULL;
pageused = 0;
while (!list_empty(head)) {
+ struct nfs_page_iter_page i;
+ struct page *page;
+
req = nfs_list_entry(head->next);
nfs_list_move_request(req, &hdr->pages);
- if (!last_page || last_page != req->wb_page) {
- pageused++;
- if (pageused > pagecount)
- break;
- *pages++ = last_page = req->wb_page;
+ if (req->wb_pgbase == 0)
+ last_page = NULL;
+
+ nfs_page_iter_page_init(&i, req);
+ while ((page = nfs_page_iter_page_get(&i)) != NULL) {
+ if (last_page != page) {
+ pageused++;
+ if (pageused > pagecount)
+ goto full;
+ *pages++ = last_page = page;
+ }
}
}
+full:
if (WARN_ON_ONCE(pageused != pagecount)) {
nfs_pgio_error(hdr);
desc->pg_error = -EINVAL;
@@ -939,7 +1044,8 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
desc->pg_ioflags &= ~FLUSH_COND_STABLE;
/* Set up the argument struct */
- nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo);
+ nfs_pgio_rpcsetup(hdr, pg_base, mirror->pg_count, desc->pg_ioflags,
+ &cinfo);
desc->pg_rpc_callops = &nfs_pgio_common_ops;
return 0;
}
@@ -1035,6 +1141,24 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
return l1->lockowner == l2->lockowner;
}
+static bool nfs_page_is_contiguous(const struct nfs_page *prev,
+ const struct nfs_page *req)
+{
+ size_t prev_end = prev->wb_pgbase + prev->wb_bytes;
+
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ return false;
+ if (req->wb_pgbase == 0)
+ return prev_end == nfs_page_max_length(prev);
+ if (req->wb_pgbase == prev_end) {
+ struct folio *folio = nfs_page_to_folio(req);
+ if (folio)
+ return folio == nfs_page_to_folio(prev);
+ return req->wb_page == prev->wb_page;
+ }
+ return false;
+}
+
/**
* nfs_coalesce_size - test two requests for compatibility
* @prev: pointer to nfs_page
@@ -1063,16 +1187,8 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev,
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
return 0;
- if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ if (!nfs_page_is_contiguous(prev, req))
return 0;
- if (req->wb_page == prev->wb_page) {
- if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
- return 0;
- } else {
- if (req->wb_pgbase != 0 ||
- prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
- return 0;
- }
}
return pgio->pg_ops->pg_test(pgio, prev, req);
}
@@ -1412,16 +1528,21 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
{
struct nfs_pgio_mirror *mirror;
struct nfs_page *prev;
+ struct folio *folio;
u32 midx;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
mirror = nfs_pgio_get_mirror(desc, midx);
if (!list_empty(&mirror->pg_list)) {
prev = nfs_list_entry(mirror->pg_list.prev);
- if (index != prev->wb_index + 1) {
- nfs_pageio_complete(desc);
- break;
- }
+ folio = nfs_page_to_folio(prev);
+ if (folio) {
+ if (index == folio_next_index(folio))
+ continue;
+ } else if (index == prev->wb_index + 1)
+ continue;
+ nfs_pageio_complete(desc);
+ break;
}
}
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a5db5158c634..306cba0b9e69 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -511,7 +511,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
spin_lock(&inode->i_lock);
pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
- pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
+ pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e3e6a41f19de..d886c8226d8f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -193,7 +193,7 @@ struct pnfs_commit_ops {
void (*recover_commit_reqs) (struct list_head *list,
struct nfs_commit_info *cinfo);
struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
- struct page *page);
+ struct folio *folio);
};
struct pnfs_layout_hdr {
@@ -395,7 +395,7 @@ void pnfs_generic_rw_release(void *data);
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo);
struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
- struct page *page);
+ struct folio *folio);
int pnfs_generic_commit_pagelist(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -557,13 +557,13 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
- struct page *page)
+ struct folio *folio)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
return NULL;
- return fl_cinfo->ops->search_commit_reqs(cinfo, page);
+ return fl_cinfo->ops->search_commit_reqs(cinfo, folio);
}
/* Should the pNFS client commit and return the layout upon a setattr */
@@ -864,7 +864,7 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
- struct page *page)
+ struct folio *folio)
{
return NULL;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 5d035dd2d7bf..a0112ad4937a 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
static struct nfs_page *
pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
- unsigned int nbuckets, struct page *page)
+ unsigned int nbuckets, struct folio *folio)
{
struct nfs_page *req;
struct pnfs_commit_bucket *b;
@@ -363,11 +363,11 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
* request is found */
for (i = 0, b = buckets; i < nbuckets; i++, b++) {
list_for_each_entry(req, &b->written, wb_list) {
- if (req->wb_page == page)
+ if (nfs_page_to_folio(req) == folio)
return req->wb_head;
}
list_for_each_entry(req, &b->committing, wb_list) {
- if (req->wb_page == page)
+ if (nfs_page_to_folio(req) == folio)
return req->wb_head;
}
}
@@ -375,14 +375,14 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
}
/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
- * for @page
+ * for @folio
* @cinfo - commit info for current inode
- * @page - page to search for matching head request
+ * @folio - page to search for matching head request
*
* Return: the head request if one is found, otherwise %NULL.
*/
-struct nfs_page *
-pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct folio *folio)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct pnfs_commit_array *array;
@@ -390,7 +390,7 @@ pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page
list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
req = pnfs_bucket_search_commit_reqs(array->buckets,
- array->nbuckets, page);
+ array->nbuckets, folio);
if (req)
return req;
}
@@ -1180,7 +1180,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
nfs_request_add_commit_list_locked(req, list, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- nfs_mark_page_unstable(req->wb_page, cinfo);
+ nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo);
return;
out_resched:
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8ae2c8d1219d..c380cff4108e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -49,12 +49,11 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
-static
-int nfs_return_empty_page(struct page *page)
+static int nfs_return_empty_folio(struct folio *folio)
{
- zero_user(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
@@ -111,18 +110,18 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
static void nfs_readpage_release(struct nfs_page *req, int error)
{
struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct page *page = req->wb_page;
+ struct folio *folio = nfs_page_to_folio(req);
dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
(long long)req_offset(req));
if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT)
- SetPageError(page);
+ folio_set_error(folio);
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(page))
- nfs_fscache_write_page(inode, page);
- unlock_page(page);
+ if (folio_test_uptodate(folio))
+ nfs_fscache_write_page(inode, &folio->page);
+ folio_unlock(folio);
}
nfs_release_request(req);
}
@@ -135,7 +134,7 @@ struct nfs_readdesc {
static void nfs_page_group_set_uptodate(struct nfs_page *req)
{
if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
- SetPageUptodate(req->wb_page);
+ folio_mark_uptodate(nfs_page_to_folio(req));
}
static void nfs_read_completion(struct nfs_pgio_header *hdr)
@@ -147,7 +146,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
goto out;
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- struct page *page = req->wb_page;
+ struct folio *folio = nfs_page_to_folio(req);
unsigned long start = req->wb_pgbase;
unsigned long end = req->wb_pgbase + req->wb_bytes;
@@ -157,14 +156,14 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
if (bytes > hdr->good_bytes) {
/* nothing in this request was good, so zero
* the full extent of the request */
- zero_user_segment(page, start, end);
+ folio_zero_segment(folio, start, end);
} else if (hdr->good_bytes - bytes < req->wb_bytes) {
/* part of this request has good bytes, but
* not all. zero the bad bytes */
start += hdr->good_bytes - bytes;
WARN_ON(start < req->wb_pgbase);
- zero_user_segment(page, start, end);
+ folio_zero_segment(folio, start, end);
}
}
error = 0;
@@ -281,33 +280,34 @@ static void nfs_readpage_result(struct rpc_task *task,
nfs_readpage_retry(task, hdr);
}
-static int
-readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
+static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
- unsigned int rsize = NFS_SERVER(inode)->rsize;
+ struct inode *inode = folio_file_mapping(folio)->host;
+ struct nfs_server *server = NFS_SERVER(inode);
+ size_t fsize = folio_size(folio);
+ unsigned int rsize = server->rsize;
struct nfs_page *new;
unsigned int len, aligned_len;
int error;
- len = nfs_page_length(page);
+ len = nfs_folio_length(folio);
if (len == 0)
- return nfs_return_empty_page(page);
+ return nfs_return_empty_folio(folio);
- aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
+ aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize);
- if (!IS_SYNC(page->mapping->host)) {
- error = nfs_fscache_read_page(page->mapping->host, page);
+ if (!IS_SYNC(inode)) {
+ error = nfs_fscache_read_page(inode, &folio->page);
if (error == 0)
goto out_unlock;
}
- new = nfs_create_request(desc->ctx, page, 0, aligned_len);
+ new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len);
if (IS_ERR(new))
goto out_error;
- if (len < PAGE_SIZE)
- zero_user_segment(page, len, PAGE_SIZE);
+ if (len < fsize)
+ folio_zero_segment(folio, len, fsize);
if (!nfs_pageio_add_request(&desc->pgio, new)) {
nfs_list_remove_request(new);
error = desc->pgio.pg_error;
@@ -318,7 +318,7 @@ readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
out_error:
error = PTR_ERR(new);
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
out:
return error;
}
@@ -331,61 +331,54 @@ out:
*/
int nfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct nfs_readdesc desc;
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = file_inode(file);
int ret;
- trace_nfs_aop_readpage(inode, page);
+ trace_nfs_aop_readpage(inode, folio);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
/*
* Try to flush any pending writes to the file..
*
- * NOTE! Because we own the page lock, there cannot
+ * NOTE! Because we own the folio lock, there cannot
* be any new pending writes generated at this point
- * for this page (other pages can be written to).
+ * for this folio (other folios can be written to).
*/
- ret = nfs_wb_page(inode, page);
+ ret = nfs_wb_folio(inode, folio);
if (ret)
goto out_unlock;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto out_unlock;
ret = -ESTALE;
if (NFS_STALE(inode))
goto out_unlock;
- if (file == NULL) {
- ret = -EBADF;
- desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ);
- if (desc.ctx == NULL)
- goto out_unlock;
- } else
- desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
+ desc.ctx = get_nfs_open_context(nfs_file_open_context(file));
xchg(&desc.ctx->error, 0);
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
- ret = readpage_async_filler(&desc, page);
+ ret = readpage_async_filler(&desc, folio);
if (ret)
goto out;
nfs_pageio_complete_read(&desc.pgio);
ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0;
if (!ret) {
- ret = wait_on_page_locked_killable(page);
- if (!PageUptodate(page) && !ret)
+ ret = folio_wait_locked_killable(folio);
+ if (!folio_test_uptodate(folio) && !ret)
ret = xchg(&desc.ctx->error, 0);
}
out:
put_nfs_open_context(desc.ctx);
- trace_nfs_aop_readpage_done(inode, page, ret);
+ trace_nfs_aop_readpage_done(inode, folio, ret);
return ret;
out_unlock:
- unlock_page(page);
- trace_nfs_aop_readpage_done(inode, page, ret);
+ folio_unlock(folio);
+ trace_nfs_aop_readpage_done(inode, folio, ret);
return ret;
}
@@ -395,7 +388,7 @@ void nfs_readahead(struct readahead_control *ractl)
struct file *file = ractl->file;
struct nfs_readdesc desc;
struct inode *inode = ractl->mapping->host;
- struct page *page;
+ struct folio *folio;
int ret;
trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages);
@@ -416,9 +409,8 @@ void nfs_readahead(struct readahead_control *ractl)
nfs_pageio_init_read(&desc.pgio, inode, false,
&nfs_async_read_completion_ops);
- while ((page = readahead_page(ractl)) != NULL) {
- ret = readpage_async_filler(&desc, page);
- put_page(page);
+ while ((folio = readahead_folio(ractl)) != NULL) {
+ ret = readpage_async_filler(&desc, folio);
if (ret)
break;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1a80d548253a..b508c985eb14 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -64,7 +64,7 @@ static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
static struct nfs_page *
nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
- struct page *page);
+ struct folio *folio);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -171,31 +171,28 @@ nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
return 0;
}
-static struct nfs_page *
-nfs_page_private_request(struct page *page)
+static struct nfs_page *nfs_folio_private_request(struct folio *folio)
{
- if (!PagePrivate(page))
- return NULL;
- return (struct nfs_page *)page_private(page);
+ return folio_get_private(folio);
}
-/*
- * nfs_page_find_head_request_locked - find head request associated with @page
+/**
+ * nfs_folio_find_private_request - find head request associated with a folio
+ * @folio: pointer to folio
*
* must be called while holding the inode lock.
*
* returns matching head request with reference held, or NULL if not found.
*/
-static struct nfs_page *
-nfs_page_find_private_request(struct page *page)
+static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
{
- struct address_space *mapping = page_file_mapping(page);
+ struct address_space *mapping = folio_file_mapping(folio);
struct nfs_page *req;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
spin_lock(&mapping->private_lock);
- req = nfs_page_private_request(page);
+ req = nfs_folio_private_request(folio);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
@@ -204,18 +201,17 @@ nfs_page_find_private_request(struct page *page)
return req;
}
-static struct nfs_page *
-nfs_page_find_swap_request(struct page *page)
+static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *req = NULL;
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
return NULL;
mutex_lock(&nfsi->commit_mutex);
- if (PageSwapCache(page)) {
+ if (folio_test_swapcache(folio)) {
req = nfs_page_search_commits_for_head_request_locked(nfsi,
- page);
+ folio);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
@@ -225,29 +221,30 @@ nfs_page_find_swap_request(struct page *page)
return req;
}
-/*
- * nfs_page_find_head_request - find head request associated with @page
+/**
+ * nfs_folio_find_head_request - find head request associated with a folio
+ * @folio: pointer to folio
*
* returns matching head request with reference held, or NULL if not found.
*/
-static struct nfs_page *nfs_page_find_head_request(struct page *page)
+static struct nfs_page *nfs_folio_find_head_request(struct folio *folio)
{
struct nfs_page *req;
- req = nfs_page_find_private_request(page);
+ req = nfs_folio_find_private_request(folio);
if (!req)
- req = nfs_page_find_swap_request(page);
+ req = nfs_folio_find_swap_request(folio);
return req;
}
-static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_page *req, *head;
int ret;
for (;;) {
- req = nfs_page_find_head_request(page);
+ req = nfs_folio_find_head_request(folio);
if (!req)
return req;
head = nfs_page_group_lock_head(req);
@@ -261,9 +258,9 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
return ERR_PTR(ret);
}
/* Ensure that nobody removed the request before we locked it */
- if (head == nfs_page_private_request(page))
+ if (head == nfs_folio_private_request(folio))
break;
- if (PageSwapCache(page))
+ if (folio_test_swapcache(folio))
break;
nfs_unlock_and_release_request(head);
}
@@ -271,18 +268,19 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
}
/* Adjust the file length if we're writing beyond the end */
-static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
+static void nfs_grow_file(struct folio *folio, unsigned int offset,
+ unsigned int count)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
loff_t end, i_size;
pgoff_t end_index;
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
- end_index = (i_size - 1) >> PAGE_SHIFT;
- if (i_size > 0 && page_index(page) < end_index)
+ end_index = ((i_size - 1) >> folio_shift(folio)) << folio_order(folio);
+ if (i_size > 0 && folio_index(folio) < end_index)
goto out;
- end = page_file_offset(page) + ((loff_t)offset+count);
+ end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count;
if (i_size >= end)
goto out;
trace_nfs_size_grow(inode, end);
@@ -308,11 +306,11 @@ static void nfs_set_pageerror(struct address_space *mapping)
spin_unlock(&inode->i_lock);
}
-static void nfs_mapping_set_error(struct page *page, int error)
+static void nfs_mapping_set_error(struct folio *folio, int error)
{
- struct address_space *mapping = page_file_mapping(page);
+ struct address_space *mapping = folio_file_mapping(folio);
- SetPageError(page);
+ folio_set_error(folio);
filemap_set_wb_err(mapping, error);
if (mapping->host)
errseq_set(&mapping->host->i_sb->s_wb_err,
@@ -359,9 +357,9 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
*/
static bool nfs_page_group_covers_page(struct nfs_page *req)
{
+ unsigned int len = nfs_folio_length(nfs_page_to_folio(req));
struct nfs_page *tmp;
unsigned int pos = 0;
- unsigned int len = nfs_page_length(req->wb_page);
nfs_page_group_lock(req);
@@ -381,11 +379,13 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
*/
static void nfs_mark_uptodate(struct nfs_page *req)
{
- if (PageUptodate(req->wb_page))
+ struct folio *folio = nfs_page_to_folio(req);
+
+ if (folio_test_uptodate(folio))
return;
if (!nfs_page_group_covers_page(req))
return;
- SetPageUptodate(req->wb_page);
+ folio_mark_uptodate(folio);
}
static int wb_priority(struct writeback_control *wbc)
@@ -407,35 +407,34 @@ int nfs_congestion_kb;
#define NFS_CONGESTION_OFF_THRESH \
(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
-static void nfs_set_page_writeback(struct page *page)
+static void nfs_folio_set_writeback(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_server *nfss = NFS_SERVER(inode);
- int ret = test_set_page_writeback(page);
-
- WARN_ON_ONCE(ret != 0);
+ struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
- if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH)
+ folio_start_writeback(folio);
+ if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH)
nfss->write_congested = 1;
}
-static void nfs_end_page_writeback(struct nfs_page *req)
+static void nfs_folio_end_writeback(struct folio *folio)
{
- struct inode *inode = page_file_mapping(req->wb_page)->host;
- struct nfs_server *nfss = NFS_SERVER(inode);
- bool is_done;
+ struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host);
- is_done = nfs_page_group_sync_on_bit(req, PG_WB_END);
- nfs_unlock_request(req);
- if (!is_done)
- return;
-
- end_page_writeback(req->wb_page);
- if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
+ folio_end_writeback(folio);
+ if (atomic_long_dec_return(&nfss->writeback) <
+ NFS_CONGESTION_OFF_THRESH)
nfss->write_congested = 0;
}
+static void nfs_page_end_writeback(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_WB_END)) {
+ nfs_unlock_request(req);
+ nfs_folio_end_writeback(nfs_page_to_folio(req));
+ } else
+ nfs_unlock_request(req);
+}
+
/*
* nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
*
@@ -550,7 +549,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
/*
* nfs_lock_and_join_requests - join all subreqs to the head req
- * @page: the page used to lookup the "page group" of nfs_page structures
+ * @folio: the folio used to lookup the "page group" of nfs_page structures
*
* This function joins all sub requests to the head request by first
* locking all requests in the group, cancelling any pending operations
@@ -560,13 +559,12 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
*
* Returns a locked, referenced pointer to the head request - which after
* this call is guaranteed to be the only request associated with the page.
- * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * Returns NULL if no requests are found for @folio, or a ERR_PTR if an
* error was encountered.
*/
-static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page)
+static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_page *head;
int ret;
@@ -575,7 +573,7 @@ nfs_lock_and_join_requests(struct page *page)
* reference to the whole page group - the group will not be destroyed
* until the head reference is released.
*/
- head = nfs_find_and_lock_page_request(page);
+ head = nfs_folio_find_and_lock_request(folio);
if (IS_ERR_OR_NULL(head))
return head;
@@ -593,11 +591,10 @@ nfs_lock_and_join_requests(struct page *page)
static void nfs_write_error(struct nfs_page *req, int error)
{
- trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req,
- error);
- nfs_mapping_set_error(req->wb_page, error);
+ trace_nfs_write_error(nfs_page_to_inode(req), req, error);
+ nfs_mapping_set_error(nfs_page_to_folio(req), error);
nfs_inode_remove_request(req);
- nfs_end_page_writeback(req);
+ nfs_page_end_writeback(req);
nfs_release_request(req);
}
@@ -605,21 +602,21 @@ static void nfs_write_error(struct nfs_page *req, int error)
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_async_flush(struct page *page,
+static int nfs_page_async_flush(struct folio *folio,
struct writeback_control *wbc,
struct nfs_pageio_descriptor *pgio)
{
struct nfs_page *req;
int ret = 0;
- req = nfs_lock_and_join_requests(page);
+ req = nfs_lock_and_join_requests(folio);
if (!req)
goto out;
ret = PTR_ERR(req);
if (IS_ERR(req))
goto out;
- nfs_set_page_writeback(page);
+ nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
/* If there is a fatal error that covers this write, just exit */
@@ -637,12 +634,12 @@ static int nfs_page_async_flush(struct page *page,
goto out_launder;
if (wbc->sync_mode == WB_SYNC_NONE)
ret = AOP_WRITEPAGE_ACTIVATE;
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
nfs_redirty_request(req);
pgio->pg_error = 0;
} else
- nfs_add_stats(page_file_mapping(page)->host,
- NFSIOS_WRITEPAGES, 1);
+ nfs_add_stats(folio_file_mapping(folio)->host,
+ NFSIOS_WRITEPAGES, 1);
out:
return ret;
out_launder:
@@ -650,21 +647,21 @@ out_launder:
return 0;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
struct nfs_pageio_descriptor *pgio)
{
- nfs_pageio_cond_complete(pgio, page_index(page));
- return nfs_page_async_flush(page, wbc, pgio);
+ nfs_pageio_cond_complete(pgio, folio_index(folio));
+ return nfs_page_async_flush(folio, wbc, pgio);
}
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page,
+static int nfs_writepage_locked(struct folio *folio,
struct writeback_control *wbc)
{
struct nfs_pageio_descriptor pgio;
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio_file_mapping(folio)->host;
int err;
if (wbc->sync_mode == WB_SYNC_NONE &&
@@ -672,9 +669,9 @@ static int nfs_writepage_locked(struct page *page,
return AOP_WRITEPAGE_ACTIVATE;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_pageio_init_write(&pgio, inode, 0,
- false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ nfs_pageio_init_write(&pgio, inode, 0, false,
+ &nfs_async_write_completion_ops);
+ err = nfs_do_writepage(folio, wbc, &pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
return err;
@@ -682,19 +679,22 @@ static int nfs_writepage_locked(struct page *page,
int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(folio, wbc);
if (ret != AOP_WRITEPAGE_ACTIVATE)
unlock_page(page);
return ret;
}
-static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
+static int nfs_writepages_callback(struct page *page,
+ struct writeback_control *wbc, void *data)
{
+ struct folio *folio = page_folio(page);
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(folio, wbc, data);
if (ret != AOP_WRITEPAGE_ACTIVATE)
unlock_page(page);
return ret;
@@ -750,10 +750,11 @@ out_err:
/*
* Insert a write request into an inode
*/
-static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static void nfs_inode_add_request(struct nfs_page *req)
{
- struct address_space *mapping = page_file_mapping(req->wb_page);
- struct nfs_inode *nfsi = NFS_I(inode);
+ struct folio *folio = nfs_page_to_folio(req);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct nfs_inode *nfsi = NFS_I(mapping->host);
WARN_ON_ONCE(req->wb_this_page != req);
@@ -765,10 +766,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
* with invalidate/truncate.
*/
spin_lock(&mapping->private_lock);
- if (likely(!PageSwapCache(req->wb_page))) {
+ if (likely(!folio_test_swapcache(folio))) {
set_bit(PG_MAPPED, &req->wb_flags);
- SetPagePrivate(req->wb_page);
- set_page_private(req->wb_page, (unsigned long)req);
+ folio_set_private(folio);
+ folio->private = req;
}
spin_unlock(&mapping->private_lock);
atomic_long_inc(&nfsi->nrequests);
@@ -785,47 +786,43 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
- struct address_space *mapping = page_file_mapping(req->wb_page);
- struct inode *inode = mapping->host;
- struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_page *head;
-
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
- head = req->wb_head;
+ struct folio *folio = nfs_page_to_folio(req->wb_head);
+ struct address_space *mapping = folio_file_mapping(folio);
spin_lock(&mapping->private_lock);
- if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
- set_page_private(head->wb_page, 0);
- ClearPagePrivate(head->wb_page);
- clear_bit(PG_MAPPED, &head->wb_flags);
+ if (likely(folio && !folio_test_swapcache(folio))) {
+ folio->private = NULL;
+ folio_clear_private(folio);
+ clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
spin_unlock(&mapping->private_lock);
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
- atomic_long_dec(&nfsi->nrequests);
+ atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests);
}
}
-static void
-nfs_mark_request_dirty(struct nfs_page *req)
+static void nfs_mark_request_dirty(struct nfs_page *req)
{
- if (req->wb_page)
- __set_page_dirty_nobuffers(req->wb_page);
+ struct folio *folio = nfs_page_to_folio(req);
+ if (folio)
+ filemap_dirty_folio(folio_mapping(folio), folio);
}
/*
* nfs_page_search_commits_for_head_request_locked
*
- * Search through commit lists on @inode for the head request for @page.
+ * Search through commit lists on @inode for the head request for @folio.
* Must be called while holding the inode (which is cinfo) lock.
*
* Returns the head request if found, or NULL if not found.
*/
static struct nfs_page *
nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
- struct page *page)
+ struct folio *folio)
{
struct nfs_page *freq, *t;
struct nfs_commit_info cinfo;
@@ -834,13 +831,13 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
nfs_init_cinfo_from_inode(&cinfo, inode);
/* search through pnfs commit lists */
- freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+ freq = pnfs_search_commit_reqs(inode, &cinfo, folio);
if (freq)
return freq->wb_head;
/* Linearly search the commit list for the correct request */
list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
- if (freq->wb_page == page)
+ if (nfs_page_to_folio(freq) == folio)
return freq->wb_head;
}
@@ -888,8 +885,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- if (req->wb_page)
- nfs_mark_page_unstable(req->wb_page, cinfo);
+ nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -948,12 +944,15 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
nfs_request_add_commit_list(req, cinfo);
}
-static void
-nfs_clear_page_commit(struct page *page)
+static void nfs_folio_clear_commit(struct folio *folio)
{
- dec_node_page_state(page, NR_WRITEBACK);
- dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
- WB_WRITEBACK);
+ if (folio) {
+ long nr = folio_nr_pages(folio);
+
+ node_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+ wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb,
+ WB_WRITEBACK, -nr);
+ }
}
/* Called holding the request lock on @req */
@@ -971,7 +970,7 @@ nfs_clear_request_commit(struct nfs_page *req)
nfs_request_remove_commit_list(req, &cinfo);
}
mutex_unlock(&NFS_I(inode)->commit_mutex);
- nfs_clear_page_commit(req->wb_page);
+ nfs_folio_clear_commit(nfs_page_to_folio(req));
}
}
@@ -1003,7 +1002,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
trace_nfs_comp_error(hdr->inode, req, hdr->error);
- nfs_mapping_set_error(req->wb_page, hdr->error);
+ nfs_mapping_set_error(nfs_page_to_folio(req),
+ hdr->error);
goto remove_req;
}
if (nfs_write_need_commit(hdr)) {
@@ -1017,7 +1017,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
remove_req:
nfs_inode_remove_request(req);
next:
- nfs_end_page_writeback(req);
+ nfs_page_end_writeback(req);
nfs_release_request(req);
}
out:
@@ -1093,10 +1093,9 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
* If the attempt fails, then the existing request is flushed out
* to disk.
*/
-static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
- struct page *page,
- unsigned int offset,
- unsigned int bytes)
+static struct nfs_page *nfs_try_to_update_request(struct folio *folio,
+ unsigned int offset,
+ unsigned int bytes)
{
struct nfs_page *req;
unsigned int rqend;
@@ -1105,7 +1104,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
end = offset + bytes;
- req = nfs_lock_and_join_requests(page);
+ req = nfs_lock_and_join_requests(folio);
if (IS_ERR_OR_NULL(req))
return req;
@@ -1138,7 +1137,7 @@ out_flushme:
*/
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
- error = nfs_wb_page(inode, page);
+ error = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
return (error < 0) ? ERR_PTR(error) : NULL;
}
@@ -1149,40 +1148,42 @@ out_flushme:
* if we have to add a new request. Also assumes that the caller has
* already called nfs_flush_incompatible() if necessary.
*/
-static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
- struct page *page, unsigned int offset, unsigned int bytes)
+static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx,
+ struct folio *folio,
+ unsigned int offset,
+ unsigned int bytes)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *req;
+ struct nfs_page *req;
- req = nfs_try_to_update_request(inode, page, offset, bytes);
+ req = nfs_try_to_update_request(folio, offset, bytes);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, page, offset, bytes);
+ req = nfs_page_create_from_folio(ctx, folio, offset, bytes);
if (IS_ERR(req))
goto out;
- nfs_inode_add_request(inode, req);
+ nfs_inode_add_request(req);
out:
return req;
}
-static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
+static int nfs_writepage_setup(struct nfs_open_context *ctx,
+ struct folio *folio, unsigned int offset,
+ unsigned int count)
{
- struct nfs_page *req;
+ struct nfs_page *req;
- req = nfs_setup_write_request(ctx, page, offset, count);
+ req = nfs_setup_write_request(ctx, folio, offset, count);
if (IS_ERR(req))
return PTR_ERR(req);
/* Update file length */
- nfs_grow_file(page, offset, count);
+ nfs_grow_file(folio, offset, count);
nfs_mark_uptodate(req);
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
return 0;
}
-int nfs_flush_incompatible(struct file *file, struct page *page)
+int nfs_flush_incompatible(struct file *file, struct folio *folio)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
@@ -1198,12 +1199,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
* dropped page.
*/
do {
- req = nfs_page_find_head_request(page);
+ req = nfs_folio_find_head_request(folio);
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
- do_flush = req->wb_page != page ||
- !nfs_match_open_context(nfs_req_openctx(req), ctx);
+ do_flush = nfs_page_to_folio(req) != folio ||
+ !nfs_match_open_context(nfs_req_openctx(req), ctx);
if (l_ctx && flctx &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock))) {
@@ -1212,7 +1213,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
nfs_release_request(req);
if (!do_flush)
return 0;
- status = nfs_wb_page(page_file_mapping(page)->host, page);
+ status = nfs_wb_folio(folio_file_mapping(folio)->host, folio);
} while (status == 0);
return status;
}
@@ -1284,9 +1285,9 @@ out:
* the PageUptodate() flag. In this case, we will need to turn off
* write optimisations that depend on the page contents being correct.
*/
-static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
- unsigned int pagelen)
+static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen)
{
+ struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_inode *nfsi = NFS_I(inode);
if (nfs_have_delegated_attributes(inode))
@@ -1300,7 +1301,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode,
out:
if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0)
return false;
- return PageUptodate(page) != 0;
+ return folio_test_uptodate(folio) != 0;
}
static bool
@@ -1318,16 +1319,17 @@ is_whole_file_wrlock(struct file_lock *fl)
* If the file is opened for synchronous writes then we can just skip the rest
* of the checks.
*/
-static int nfs_can_extend_write(struct file *file, struct page *page,
- struct inode *inode, unsigned int pagelen)
+static int nfs_can_extend_write(struct file *file, struct folio *folio,
+ unsigned int pagelen)
{
- int ret;
+ struct inode *inode = file_inode(file);
struct file_lock_context *flctx = locks_inode_context(inode);
struct file_lock *fl;
+ int ret;
if (file->f_flags & O_DSYNC)
return 0;
- if (!nfs_write_pageuptodate(page, inode, pagelen))
+ if (!nfs_folio_write_uptodate(folio, pagelen))
return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
@@ -1359,33 +1361,33 @@ static int nfs_can_extend_write(struct file *file, struct page *page,
* XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
* things with a page scheduled for an RPC call (e.g. invalidate it).
*/
-int nfs_updatepage(struct file *file, struct page *page,
- unsigned int offset, unsigned int count)
+int nfs_update_folio(struct file *file, struct folio *folio,
+ unsigned int offset, unsigned int count)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct address_space *mapping = page_file_mapping(page);
- struct inode *inode = mapping->host;
- unsigned int pagelen = nfs_page_length(page);
+ struct address_space *mapping = folio_file_mapping(folio);
+ struct inode *inode = mapping->host;
+ unsigned int pagelen = nfs_folio_length(folio);
int status = 0;
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
- dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
- file, count, (long long)(page_file_offset(page) + offset));
+ dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count,
+ (long long)(folio_file_pos(folio) + offset));
if (!count)
goto out;
- if (nfs_can_extend_write(file, page, inode, pagelen)) {
+ if (nfs_can_extend_write(file, folio, pagelen)) {
count = max(count + offset, pagelen);
offset = 0;
}
- status = nfs_writepage_setup(ctx, page, offset, count);
+ status = nfs_writepage_setup(ctx, folio, offset, count);
if (status < 0)
nfs_set_pageerror(mapping);
out:
- dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
+ dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
}
@@ -1421,13 +1423,13 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
*/
static void nfs_redirty_request(struct nfs_page *req)
{
- struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
+ struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
/* Bump the transmission count */
req->wb_nio++;
nfs_mark_request_dirty(req);
atomic_long_inc(&nfsi->redirtied_pages);
- nfs_end_page_writeback(req);
+ nfs_page_end_writeback(req);
nfs_release_request(req);
}
@@ -1785,18 +1787,18 @@ void nfs_retry_commit(struct list_head *page_list,
req = nfs_list_entry(page_list->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
- if (!cinfo->dreq)
- nfs_clear_page_commit(req->wb_page);
+ nfs_folio_clear_commit(nfs_page_to_folio(req));
nfs_unlock_and_release_request(req);
}
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
-static void
-nfs_commit_resched_write(struct nfs_commit_info *cinfo,
- struct nfs_page *req)
+static void nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
- __set_page_dirty_nobuffers(req->wb_page);
+ struct folio *folio = nfs_page_to_folio(req);
+
+ filemap_dirty_folio(folio_mapping(folio), folio);
}
/*
@@ -1847,12 +1849,13 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
struct nfs_server *nfss;
+ struct folio *folio;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (req->wb_page)
- nfs_clear_page_commit(req->wb_page);
+ folio = nfs_page_to_folio(req);
+ nfs_folio_clear_commit(folio);
dprintk("NFS: commit (%s/%llu %d@%lld)",
nfs_req_openctx(req)->dentry->d_sb->s_id,
@@ -1860,10 +1863,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
req->wb_bytes,
(long long)req_offset(req));
if (status < 0) {
- if (req->wb_page) {
+ if (folio) {
trace_nfs_commit_error(data->inode, req,
status);
- nfs_mapping_set_error(req->wb_page, status);
+ nfs_mapping_set_error(folio, status);
nfs_inode_remove_request(req);
}
dprintk_cont(", error = %d\n", status);
@@ -1874,7 +1877,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
* returned by the server against all stored verfs. */
if (nfs_write_match_verf(verf, req)) {
/* We have a match */
- if (req->wb_page)
+ if (folio)
nfs_inode_remove_request(req);
dprintk_cont(" OK\n");
goto next;
@@ -2055,7 +2058,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
/* blocking call to cancel all requests and join to a single (head)
* request */
- req = nfs_lock_and_join_requests(&folio->page);
+ req = nfs_lock_and_join_requests(folio);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
@@ -2071,13 +2074,18 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
return ret;
}
-/*
- * Write back all requests on one page - we do this before reading it.
+/**
+ * nfs_wb_folio - Write back all requests on one page
+ * @inode: pointer to page
+ * @folio: pointer to folio
+ *
+ * Assumes that the folio has been locked by the caller, and will
+ * not unlock it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_folio(struct inode *inode, struct folio *folio)
{
- loff_t range_start = page_file_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
+ loff_t range_start = folio_file_pos(folio);
+ loff_t range_end = range_start + (loff_t)folio_size(folio) - 1;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -2086,25 +2094,25 @@ int nfs_wb_page(struct inode *inode, struct page *page)
};
int ret;
- trace_nfs_writeback_page_enter(inode);
+ trace_nfs_writeback_folio(inode, folio);
for (;;) {
- wait_on_page_writeback(page);
- if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ folio_wait_writeback(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ ret = nfs_writepage_locked(folio, &wbc);
if (ret < 0)
goto out_error;
continue;
}
ret = 0;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
break;
ret = nfs_commit_inode(inode, FLUSH_SYNC);
if (ret < 0)
goto out_error;
}
out_error:
- trace_nfs_writeback_page_exit(inode, ret);
+ trace_nfs_writeback_folio_done(inode, folio, ret);
return ret;
}
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
deleted file mode 100644
index 76bee0a0d308..000000000000
--- a/fs/nfsd/fault_inject.c
+++ /dev/null
@@ -1,142 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
- *
- * Uses debugfs to create fault injection points for client testing
- */
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/nsproxy.h>
-#include <linux/sunrpc/addr.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include "state.h"
-#include "netns.h"
-
-struct nfsd_fault_inject_op {
- char *file;
- u64 (*get)(void);
- u64 (*set_val)(u64);
- u64 (*set_clnt)(struct sockaddr_storage *, size_t);
-};
-
-static struct dentry *debug_dir;
-
-static ssize_t fault_inject_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- static u64 val;
- char read_buf[25];
- size_t size;
- loff_t pos = *ppos;
- struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
-
- if (!pos)
- val = op->get();
- size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
-
- return simple_read_from_buffer(buf, len, ppos, read_buf, size);
-}
-
-static ssize_t fault_inject_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- char write_buf[INET6_ADDRSTRLEN];
- size_t size = min(sizeof(write_buf) - 1, len);
- struct net *net = current->nsproxy->net_ns;
- struct sockaddr_storage sa;
- struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
- u64 val;
- char *nl;
-
- if (copy_from_user(write_buf, buf, size))
- return -EFAULT;
- write_buf[size] = '\0';
-
- /* Deal with any embedded newlines in the string */
- nl = strchr(write_buf, '\n');
- if (nl) {
- size = nl - write_buf;
- *nl = '\0';
- }
-
- size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
- if (size > 0) {
- val = op->set_clnt(&sa, size);
- if (val)
- pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
- op->file, write_buf, val);
- } else {
- val = simple_strtoll(write_buf, NULL, 0);
- if (val == 0)
- pr_info("NFSD Fault Injection: %s (all)", op->file);
- else
- pr_info("NFSD Fault Injection: %s (n = %llu)",
- op->file, val);
- val = op->set_val(val);
- pr_info("NFSD: %s: found %llu", op->file, val);
- }
- return len; /* on success, claim we got the whole input */
-}
-
-static const struct file_operations fops_nfsd = {
- .owner = THIS_MODULE,
- .read = fault_inject_read,
- .write = fault_inject_write,
-};
-
-void nfsd_fault_inject_cleanup(void)
-{
- debugfs_remove_recursive(debug_dir);
-}
-
-static struct nfsd_fault_inject_op inject_ops[] = {
- {
- .file = "forget_clients",
- .get = nfsd_inject_print_clients,
- .set_val = nfsd_inject_forget_clients,
- .set_clnt = nfsd_inject_forget_client,
- },
- {
- .file = "forget_locks",
- .get = nfsd_inject_print_locks,
- .set_val = nfsd_inject_forget_locks,
- .set_clnt = nfsd_inject_forget_client_locks,
- },
- {
- .file = "forget_openowners",
- .get = nfsd_inject_print_openowners,
- .set_val = nfsd_inject_forget_openowners,
- .set_clnt = nfsd_inject_forget_client_openowners,
- },
- {
- .file = "forget_delegations",
- .get = nfsd_inject_print_delegations,
- .set_val = nfsd_inject_forget_delegations,
- .set_clnt = nfsd_inject_forget_client_delegations,
- },
- {
- .file = "recall_delegations",
- .get = nfsd_inject_print_delegations,
- .set_val = nfsd_inject_recall_delegations,
- .set_clnt = nfsd_inject_recall_client_delegations,
- },
-};
-
-void nfsd_fault_inject_init(void)
-{
- unsigned int i;
- struct nfsd_fault_inject_op *op;
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-
- debug_dir = debugfs_create_dir("nfsd", NULL);
-
- for (i = 0; i < ARRAY_SIZE(inject_ops); i++) {
- op = &inject_ops[i];
- debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd);
- }
-}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index c0950edb26b0..6e8712bd7c99 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
return nf;
}
+/**
+ * nfsd_file_check_write_error - check for writeback errors on a file
+ * @nf: nfsd_file to check for writeback errors
+ *
+ * Check whether a nfsd_file has an unseen error. Reset the write
+ * verifier if so.
+ */
static void
-nfsd_file_fsync(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
- int ret;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return;
- ret = vfs_fsync(file, 1);
- trace_nfsd_file_fsync(nf, ret);
- if (ret)
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
-}
-
-static int
nfsd_file_check_write_error(struct nfsd_file *nf)
{
struct file *file = nf->nf_file;
- if (!file || !(file->f_mode & FMODE_WRITE))
- return 0;
- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
+ if ((file->f_mode & FMODE_WRITE) &&
+ filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)))
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
static void
nfsd_file_hash_remove(struct nfsd_file *nf)
{
trace_nfsd_file_unhash(nf);
-
- if (nfsd_file_check_write_error(nf))
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash,
nfsd_file_rhash_params);
}
@@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf)
this_cpu_add(nfsd_file_total_age, age);
nfsd_file_unhash(nf);
-
- /*
- * We call fsync here in order to catch writeback errors. It's not
- * strictly required by the protocol, but an nfsd_file could get
- * evicted from the cache before a COMMIT comes in. If another
- * task were to open that file in the interim and scrape the error,
- * then the client may never see it. By calling fsync here, we ensure
- * that writeback happens before the entry is freed, and that any
- * errors reported result in the write verifier changing.
- */
- nfsd_file_fsync(nf);
-
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
get_file(nf->nf_file);
filp_close(nf->nf_file, NULL);
+ nfsd_file_check_write_error(nf);
fput(nf->nf_file);
}
@@ -452,7 +431,7 @@ static bool nfsd_file_lru_remove(struct nfsd_file *nf)
struct nfsd_file *
nfsd_file_get(struct nfsd_file *nf)
{
- if (likely(refcount_inc_not_zero(&nf->nf_ref)))
+ if (nf && refcount_inc_not_zero(&nf->nf_ref))
return nf;
return NULL;
}
@@ -1107,8 +1086,7 @@ retry:
rcu_read_lock();
nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
nfsd_file_rhash_params);
- if (nf)
- nf = nfsd_file_get(nf);
+ nf = nfsd_file_get(nf);
rcu_read_unlock();
if (nf) {
@@ -1159,6 +1137,7 @@ wait_for_construction:
out:
if (status == nfs_ok) {
this_cpu_inc(nfsd_file_acquisitions);
+ nfsd_file_check_write_error(nf);
*pnf = nf;
} else {
if (refcount_dec_and_test(&nf->nf_ref))
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 995cb2c90b1a..12b2b9bc07bf 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -377,10 +377,11 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
},
};
-static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]);
const struct svc_version nfsd_acl_version2 = {
.vs_vers = 2,
- .vs_nproc = 5,
+ .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures2),
.vs_proc = nfsd_acl_procedures2,
.vs_count = nfsd_acl_count2,
.vs_dispatch = nfsd_dispatch,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 887803735e2a..73adca47d373 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -266,10 +266,11 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
},
};
-static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]);
const struct svc_version nfsd_acl_version3 = {
.vs_vers = 3,
- .vs_nproc = 3,
+ .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures3),
.vs_proc = nfsd_acl_procedures3,
.vs_count = nfsd_acl_count3,
.vs_dispatch = nfsd_dispatch,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index f41992ecd0d7..e6bb8eeb5bc2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -1064,10 +1064,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
},
};
-static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]);
const struct svc_version nfsd_version3 = {
.vs_vers = 3,
- .vs_nproc = 22,
+ .vs_nproc = ARRAY_SIZE(nfsd_procedures3),
.vs_proc = nfsd_procedures3,
.vs_dispatch = nfsd_dispatch,
.vs_count = nfsd_count3,
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3564d1c6f610..e8a80052cb1b 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -323,11 +323,11 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
if (ls->ls_recalled)
goto out_unlock;
- ls->ls_recalled = true;
- atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
if (list_empty(&ls->ls_layouts))
goto out_unlock;
+ ls->ls_recalled = true;
+ atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
refcount_inc(&ls->ls_stid.sc_count);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f189ba7995f5..5ae670807449 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1214,8 +1214,10 @@ out:
return status;
out_put_dst:
nfsd_file_put(*dst);
+ *dst = NULL;
out_put_src:
nfsd_file_put(*src);
+ *src = NULL;
goto out;
}
@@ -1293,15 +1295,15 @@ extern void nfs_sb_deactive(struct super_block *sb);
* setup a work entry in the ssc delayed unmount list.
*/
static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
- struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt)
+ struct nfsd4_ssc_umount_item **nsui)
{
struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd4_ssc_umount_item *work = NULL;
struct nfsd4_ssc_umount_item *tmp;
DEFINE_WAIT(wait);
+ __be32 status = 0;
- *ss_mnt = NULL;
- *retwork = NULL;
+ *nsui = NULL;
work = kzalloc(sizeof(*work), GFP_KERNEL);
try_again:
spin_lock(&nn->nfsd_ssc_lock);
@@ -1325,12 +1327,12 @@ try_again:
finish_wait(&nn->nfsd_ssc_waitq, &wait);
goto try_again;
}
- *ss_mnt = ni->nsui_vfsmount;
+ *nsui = ni;
refcount_inc(&ni->nsui_refcnt);
spin_unlock(&nn->nfsd_ssc_lock);
kfree(work);
- /* return vfsmount in ss_mnt */
+ /* return vfsmount in (*nsui)->nsui_vfsmount */
return 0;
}
if (work) {
@@ -1338,31 +1340,32 @@ try_again:
refcount_set(&work->nsui_refcnt, 2);
work->nsui_busy = true;
list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
- *retwork = work;
- }
+ *nsui = work;
+ } else
+ status = nfserr_resource;
spin_unlock(&nn->nfsd_ssc_lock);
- return 0;
+ return status;
}
-static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn,
- struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt)
+static void nfsd4_ssc_update_dul(struct nfsd_net *nn,
+ struct nfsd4_ssc_umount_item *nsui,
+ struct vfsmount *ss_mnt)
{
- /* set nsui_vfsmount, clear busy flag and wakeup waiters */
spin_lock(&nn->nfsd_ssc_lock);
- work->nsui_vfsmount = ss_mnt;
- work->nsui_busy = false;
+ nsui->nsui_vfsmount = ss_mnt;
+ nsui->nsui_busy = false;
wake_up_all(&nn->nfsd_ssc_waitq);
spin_unlock(&nn->nfsd_ssc_lock);
}
-static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn,
- struct nfsd4_ssc_umount_item *work)
+static void nfsd4_ssc_cancel_dul(struct nfsd_net *nn,
+ struct nfsd4_ssc_umount_item *nsui)
{
spin_lock(&nn->nfsd_ssc_lock);
- list_del(&work->nsui_list);
+ list_del(&nsui->nsui_list);
wake_up_all(&nn->nfsd_ssc_waitq);
spin_unlock(&nn->nfsd_ssc_lock);
- kfree(work);
+ kfree(nsui);
}
/*
@@ -1370,7 +1373,7 @@ static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn,
*/
static __be32
nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
- struct vfsmount **mount)
+ struct nfsd4_ssc_umount_item **nsui)
{
struct file_system_type *type;
struct vfsmount *ss_mnt;
@@ -1381,7 +1384,6 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
char *ipaddr, *dev_name, *raw_data;
int len, raw_len;
__be32 status = nfserr_inval;
- struct nfsd4_ssc_umount_item *work = NULL;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
naddr = &nss->u.nl4_addr;
@@ -1389,6 +1391,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
naddr->addr_len,
(struct sockaddr *)&tmp_addr,
sizeof(tmp_addr));
+ *nsui = NULL;
if (tmp_addrlen == 0)
goto out_err;
@@ -1431,10 +1434,10 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
goto out_free_rawdata;
snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
- status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt);
+ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
if (status)
goto out_free_devname;
- if (ss_mnt)
+ if ((*nsui)->nsui_vfsmount)
goto out_done;
/* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
@@ -1442,15 +1445,12 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
module_put(type->owner);
if (IS_ERR(ss_mnt)) {
status = nfserr_nodev;
- if (work)
- nfsd4_ssc_cancel_dul_work(nn, work);
+ nfsd4_ssc_cancel_dul(nn, *nsui);
goto out_free_devname;
}
- if (work)
- nfsd4_ssc_update_dul_work(nn, work, ss_mnt);
+ nfsd4_ssc_update_dul(nn, *nsui, ss_mnt);
out_done:
status = 0;
- *mount = ss_mnt;
out_free_devname:
kfree(dev_name);
@@ -1474,7 +1474,7 @@ out_err:
static __be32
nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
- struct nfsd4_copy *copy, struct vfsmount **mount)
+ struct nfsd4_copy *copy)
{
struct svc_fh *s_fh = NULL;
stateid_t *s_stid = &copy->cp_src_stateid;
@@ -1487,7 +1487,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
if (status)
goto out;
- status = nfsd4_interssc_connect(copy->cp_src, rqstp, mount);
+ status = nfsd4_interssc_connect(copy->cp_src, rqstp, &copy->ss_nsui);
if (status)
goto out;
@@ -1505,45 +1505,26 @@ out:
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
+nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp,
struct nfsd_file *dst)
{
- bool found = false;
- long timeout;
- struct nfsd4_ssc_umount_item *tmp;
- struct nfsd4_ssc_umount_item *ni = NULL;
struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
+ long timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
nfs42_ssc_close(filp);
- nfsd_file_put(dst);
fput(filp);
- if (!nn) {
- mntput(ss_mnt);
- return;
- }
spin_lock(&nn->nfsd_ssc_lock);
- timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
- list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
- if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) {
- list_del(&ni->nsui_list);
- /*
- * vfsmount can be shared by multiple exports,
- * decrement refcnt. If the count drops to 1 it
- * will be unmounted when nsui_expire expires.
- */
- refcount_dec(&ni->nsui_refcnt);
- ni->nsui_expire = jiffies + timeout;
- list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list);
- found = true;
- break;
- }
- }
+ list_del(&nsui->nsui_list);
+ /*
+ * vfsmount can be shared by multiple exports,
+ * decrement refcnt. If the count drops to 1 it
+ * will be unmounted when nsui_expire expires.
+ */
+ refcount_dec(&nsui->nsui_refcnt);
+ nsui->nsui_expire = jiffies + timeout;
+ list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list);
spin_unlock(&nn->nfsd_ssc_lock);
- if (!found) {
- mntput(ss_mnt);
- return;
- }
}
#else /* CONFIG_NFSD_V4_2_INTER_SSC */
@@ -1551,15 +1532,13 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
static __be32
nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
- struct nfsd4_copy *copy,
- struct vfsmount **mount)
+ struct nfsd4_copy *copy)
{
- *mount = NULL;
return nfserr_inval;
}
static void
-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
+nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp,
struct nfsd_file *dst)
{
}
@@ -1582,13 +1561,6 @@ nfsd4_setup_intra_ssc(struct svc_rqst *rqstp,
&copy->nf_dst);
}
-static void
-nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst)
-{
- nfsd_file_put(src);
- nfsd_file_put(dst);
-}
-
static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
{
struct nfsd4_cb_offload *cbo =
@@ -1700,18 +1672,27 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server));
memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
- dst->ss_mnt = src->ss_mnt;
+ dst->ss_nsui = src->ss_nsui;
+}
+
+static void release_copy_files(struct nfsd4_copy *copy)
+{
+ if (copy->nf_src)
+ nfsd_file_put(copy->nf_src);
+ if (copy->nf_dst)
+ nfsd_file_put(copy->nf_dst);
}
static void cleanup_async_copy(struct nfsd4_copy *copy)
{
nfs4_free_copy_state(copy);
- nfsd_file_put(copy->nf_dst);
- if (!nfsd4_ssc_is_inter(copy))
- nfsd_file_put(copy->nf_src);
- spin_lock(&copy->cp_clp->async_lock);
- list_del(&copy->copies);
- spin_unlock(&copy->cp_clp->async_lock);
+ release_copy_files(copy);
+ if (copy->cp_clp) {
+ spin_lock(&copy->cp_clp->async_lock);
+ if (!list_empty(&copy->copies))
+ list_del_init(&copy->copies);
+ spin_unlock(&copy->cp_clp->async_lock);
+ }
nfs4_put_copy(copy);
}
@@ -1749,8 +1730,8 @@ static int nfsd4_do_async_copy(void *data)
if (nfsd4_ssc_is_inter(copy)) {
struct file *filp;
- filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
- &copy->stateid);
+ filp = nfs42_ssc_open(copy->ss_nsui->nsui_vfsmount,
+ &copy->c_fh, &copy->stateid);
if (IS_ERR(filp)) {
switch (PTR_ERR(filp)) {
case -EBADF:
@@ -1764,11 +1745,10 @@ static int nfsd4_do_async_copy(void *data)
}
nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
false);
- nfsd4_cleanup_inter_ssc(copy->ss_mnt, filp, copy->nf_dst);
+ nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst);
} else {
nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
copy->nf_dst->nf_file, false);
- nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
do_callback:
@@ -1790,8 +1770,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_notsupp;
goto out;
}
- status = nfsd4_setup_inter_ssc(rqstp, cstate, copy,
- &copy->ss_mnt);
+ status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
if (status)
return nfserr_offload_denied;
} else {
@@ -1810,12 +1789,13 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
if (!async_copy)
goto out_err;
+ INIT_LIST_HEAD(&async_copy->copies);
+ refcount_set(&async_copy->refcount, 1);
async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
if (!async_copy->cp_src)
goto out_err;
if (!nfs4_init_copy_state(nn, copy))
goto out_err;
- refcount_set(&async_copy->refcount, 1);
memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(copy->cp_res.cb_stateid));
dup_copy_fields(copy, async_copy);
@@ -1832,38 +1812,53 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
} else {
status = nfsd4_do_copy(copy, copy->nf_src->nf_file,
copy->nf_dst->nf_file, true);
- nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
}
out:
+ release_copy_files(copy);
return status;
out_err:
+ if (nfsd4_ssc_is_inter(copy)) {
+ /*
+ * Source's vfsmount of inter-copy will be unmounted
+ * by the laundromat. Use copy instead of async_copy
+ * since async_copy->ss_nsui might not be set yet.
+ */
+ refcount_dec(&copy->ss_nsui->nsui_refcnt);
+ }
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- /*
- * source's vfsmount of inter-copy will be unmounted
- * by the laundromat
- */
goto out;
}
-struct nfsd4_copy *
-find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+static struct nfsd4_copy *
+find_async_copy_locked(struct nfs4_client *clp, stateid_t *stateid)
{
struct nfsd4_copy *copy;
- spin_lock(&clp->async_lock);
+ lockdep_assert_held(&clp->async_lock);
+
list_for_each_entry(copy, &clp->async_copies, copies) {
if (memcmp(&copy->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE))
continue;
- refcount_inc(&copy->refcount);
- spin_unlock(&clp->async_lock);
return copy;
}
- spin_unlock(&clp->async_lock);
return NULL;
}
+static struct nfsd4_copy *
+find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+{
+ struct nfsd4_copy *copy;
+
+ spin_lock(&clp->async_lock);
+ copy = find_async_copy_locked(clp, stateid);
+ if (copy)
+ refcount_inc(&copy->refcount);
+ spin_unlock(&clp->async_lock);
+ return copy;
+}
+
static __be32
nfsd4_offload_cancel(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
@@ -1948,22 +1943,24 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd_file_put(nf);
return status;
}
+
static __be32
nfsd4_offload_status(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_offload_status *os = &u->offload_status;
- __be32 status = 0;
+ __be32 status = nfs_ok;
struct nfsd4_copy *copy;
struct nfs4_client *clp = cstate->clp;
- copy = find_async_copy(clp, &os->stateid);
- if (copy) {
+ spin_lock(&clp->async_lock);
+ copy = find_async_copy_locked(clp, &os->stateid);
+ if (copy)
os->count = copy->cp_res.wr_bytes_written;
- nfs4_put_copy(copy);
- } else
+ else
status = nfserr_bad_stateid;
+ spin_unlock(&clp->async_lock);
return status;
}
@@ -3619,12 +3616,13 @@ static const struct svc_procedure nfsd_procedures4[2] = {
},
};
-static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures4)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfsd_count4[ARRAY_SIZE(nfsd_procedures4)]);
const struct svc_version nfsd_version4 = {
.vs_vers = 4,
- .vs_nproc = 2,
+ .vs_nproc = ARRAY_SIZE(nfsd_procedures4),
.vs_proc = nfsd_procedures4,
- .vs_count = nfsd_count3,
+ .vs_count = nfsd_count4,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS4_SVC_XDRSIZE,
.vs_rpcb_optnl = true,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c1684da6c01f..6e61fa3acaf1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -600,23 +600,15 @@ put_nfs4_file(struct nfs4_file *fi)
}
static struct nfsd_file *
-__nfs4_get_fd(struct nfs4_file *f, int oflag)
-{
- if (f->fi_fds[oflag])
- return nfsd_file_get(f->fi_fds[oflag]);
- return NULL;
-}
-
-static struct nfsd_file *
find_writeable_file_locked(struct nfs4_file *f)
{
struct nfsd_file *ret;
lockdep_assert_held(&f->fi_lock);
- ret = __nfs4_get_fd(f, O_WRONLY);
+ ret = nfsd_file_get(f->fi_fds[O_WRONLY]);
if (!ret)
- ret = __nfs4_get_fd(f, O_RDWR);
+ ret = nfsd_file_get(f->fi_fds[O_RDWR]);
return ret;
}
@@ -639,9 +631,9 @@ find_readable_file_locked(struct nfs4_file *f)
lockdep_assert_held(&f->fi_lock);
- ret = __nfs4_get_fd(f, O_RDONLY);
+ ret = nfsd_file_get(f->fi_fds[O_RDONLY]);
if (!ret)
- ret = __nfs4_get_fd(f, O_RDWR);
+ ret = nfsd_file_get(f->fi_fds[O_RDWR]);
return ret;
}
@@ -665,11 +657,11 @@ find_any_file(struct nfs4_file *f)
if (!f)
return NULL;
spin_lock(&f->fi_lock);
- ret = __nfs4_get_fd(f, O_RDWR);
+ ret = nfsd_file_get(f->fi_fds[O_RDWR]);
if (!ret) {
- ret = __nfs4_get_fd(f, O_WRONLY);
+ ret = nfsd_file_get(f->fi_fds[O_WRONLY]);
if (!ret)
- ret = __nfs4_get_fd(f, O_RDONLY);
+ ret = nfsd_file_get(f->fi_fds[O_RDONLY]);
}
spin_unlock(&f->fi_lock);
return ret;
@@ -688,15 +680,6 @@ static struct nfsd_file *find_any_file_locked(struct nfs4_file *f)
return NULL;
}
-static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f)
-{
- lockdep_assert_held(&f->fi_lock);
-
- if (f->fi_deleg_file)
- return f->fi_deleg_file;
- return NULL;
-}
-
static atomic_long_t num_delegations;
unsigned long max_delegations;
@@ -992,7 +975,6 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
- stid->cs_type = cs_type;
idr_preload(GFP_KERNEL);
spin_lock(&nn->s2s_cp_lock);
@@ -1003,6 +985,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
idr_preload_end();
if (new_id < 0)
return 0;
+ stid->cs_type = cs_type;
return 1;
}
@@ -1036,7 +1019,8 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy)
{
struct nfsd_net *nn;
- WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID);
+ if (copy->cp_stateid.cs_type != NFS4_COPY_STID)
+ return;
nn = net_generic(copy->cp_clp->net, nfsd_net_id);
spin_lock(&nn->s2s_cp_lock);
idr_remove(&nn->s2s_cp_stateids,
@@ -2705,7 +2689,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
spin_lock(&nf->fi_lock);
- file = find_deleg_file_locked(nf);
+ file = nf->fi_deleg_file;
if (!file)
goto out;
@@ -5298,16 +5282,17 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp,
/* test and set deny mode */
spin_lock(&fp->fi_lock);
status = nfs4_file_check_deny(fp, open->op_share_deny);
- if (status == nfs_ok) {
- if (status != nfserr_share_denied) {
- set_deny(open->op_share_deny, stp);
- fp->fi_share_deny |=
- (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
- } else {
- if (nfs4_resolve_deny_conflicts_locked(fp, false,
- stp, open->op_share_deny, false))
- status = nfserr_jukebox;
- }
+ switch (status) {
+ case nfs_ok:
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
+ (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
+ break;
+ case nfserr_share_denied:
+ if (nfs4_resolve_deny_conflicts_locked(fp, false,
+ stp, open->op_share_deny, false))
+ status = nfserr_jukebox;
+ break;
}
spin_unlock(&fp->fi_lock);
@@ -5438,6 +5423,23 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
return 0;
}
+/*
+ * We avoid breaking delegations held by a client due to its own activity, but
+ * clearing setuid/setgid bits on a write is an implicit activity and the client
+ * may not notice and continue using the old mode. Avoid giving out a delegation
+ * on setuid/setgid files when the client is requesting an open for write.
+ */
+static int
+nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf)
+{
+ struct inode *inode = file_inode(nf->nf_file);
+
+ if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) &&
+ (inode->i_mode & (S_ISUID|S_ISGID)))
+ return -EAGAIN;
+ return 0;
+}
+
static struct nfs4_delegation *
nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct svc_fh *parent)
@@ -5471,6 +5473,8 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
spin_lock(&fp->fi_lock);
if (nfs4_delegation_exists(clp, fp))
status = -EAGAIN;
+ else if (nfsd4_verify_setuid_write(open, nf))
+ status = -EAGAIN;
else if (!fp->fi_deleg_file) {
fp->fi_deleg_file = nf;
/* increment early to prevent fi_deleg_file from being
@@ -5511,6 +5515,14 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (status)
goto out_unlock;
+ /*
+ * Now that the deleg is set, check again to ensure that nothing
+ * raced in and changed the mode while we weren't lookng.
+ */
+ status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file);
+ if (status)
+ goto out_unlock;
+
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
if (fp->fi_had_conflict)
@@ -6406,23 +6418,26 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
static struct nfsd_file *
nfs4_find_file(struct nfs4_stid *s, int flags)
{
+ struct nfsd_file *ret = NULL;
+
if (!s)
return NULL;
switch (s->sc_type) {
case NFS4_DELEG_STID:
- if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
- return NULL;
- return nfsd_file_get(s->sc_file->fi_deleg_file);
+ spin_lock(&s->sc_file->fi_lock);
+ ret = nfsd_file_get(s->sc_file->fi_deleg_file);
+ spin_unlock(&s->sc_file->fi_lock);
+ break;
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
if (flags & RD_STATE)
- return find_readable_file(s->sc_file);
+ ret = find_readable_file(s->sc_file);
else
- return find_writeable_file(s->sc_file);
+ ret = find_writeable_file(s->sc_file);
}
- return NULL;
+ return ret;
}
static __be32
@@ -6547,8 +6562,19 @@ void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
spin_unlock(&nn->s2s_cp_lock);
}
-/*
- * Checks for stateid operations
+/**
+ * nfs4_preprocess_stateid_op - find and prep stateid for an operation
+ * @rqstp: incoming request from client
+ * @cstate: current compound state
+ * @fhp: filehandle associated with requested stateid
+ * @stateid: stateid (provided by client)
+ * @flags: flags describing type of operation to be done
+ * @nfp: optional nfsd_file return pointer (may be NULL)
+ * @cstid: optional returned nfs4_stid pointer (may be NULL)
+ *
+ * Given info from the client, look up a nfs4_stid for the operation. On
+ * success, it returns a reference to the nfs4_stid and/or the nfsd_file
+ * associated with it.
*/
__be32
nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
@@ -6737,8 +6763,18 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
return status;
}
-/*
- * Checks for sequence id mutating operations.
+/**
+ * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op
+ * @cstate: compund state
+ * @seqid: seqid (provided by client)
+ * @stateid: stateid (provided by client)
+ * @typemask: mask of allowable types for this operation
+ * @stpp: return pointer for the stateid found
+ * @nn: net namespace for request
+ *
+ * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and
+ * return it in @stpp. On a nfs_ok return, the returned stateid will
+ * have its st_mutex locked.
*/
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 3e64a3d50a1c..041faa13b852 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -488,7 +488,7 @@ found_entry:
case RC_NOCACHE:
break;
case RC_REPLSTAT:
- svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
+ xdr_stream_encode_be32(&rqstp->rq_res_stream, rp->c_replstat);
rtn = RC_REPLY;
break;
case RC_REPLBUFF:
@@ -509,7 +509,7 @@ out_trace:
* nfsd_cache_update - Update an entry in the duplicate reply cache.
* @rqstp: svc_rqst with a finished Reply
* @cachetype: which cache to update
- * @statp: Reply's status code
+ * @statp: pointer to Reply's NFS status code, or NULL
*
* This is called from nfsd_dispatch when the procedure has been
* executed and the complete reply is in rqstp->rq_res.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c2577ee7ffb2..7b8f17ee5224 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -14,7 +14,6 @@
#include <linux/lockd/lockd.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/gss_api.h>
-#include <linux/sunrpc/gss_krb5_enctypes.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/module.h>
#include <linux/fsnotify.h>
@@ -47,7 +46,6 @@ enum {
NFSD_MaxBlkSize,
NFSD_MaxConnections,
NFSD_Filecache,
- NFSD_SupportedEnctypes,
/*
* The below MUST come last. Otherwise we leave a hole in nfsd_files[]
* with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,16 +185,6 @@ static int export_features_show(struct seq_file *m, void *v)
DEFINE_SHOW_ATTRIBUTE(export_features);
-#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
-static int supported_enctypes_show(struct seq_file *m, void *v)
-{
- seq_printf(m, KRB5_SUPPORTED_ENCTYPES);
- return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(supported_enctypes);
-#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
-
static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
@@ -1150,6 +1138,9 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
inode->i_op = &simple_dir_inode_operations;
inc_nlink(inode);
break;
+ case S_IFLNK:
+ inode->i_op = &simple_symlink_inode_operations;
+ break;
default:
break;
}
@@ -1195,6 +1186,54 @@ out_err:
goto out;
}
+#if IS_ENABLED(CONFIG_SUNRPC_GSS)
+static int __nfsd_symlink(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const char *content)
+{
+ struct inode *inode;
+
+ inode = nfsd_get_inode(dir->i_sb, mode);
+ if (!inode)
+ return -ENOMEM;
+
+ inode->i_link = (char *)content;
+ inode->i_size = strlen(content);
+
+ d_add(dentry, inode);
+ inc_nlink(dir);
+ fsnotify_create(dir, dentry);
+ return 0;
+}
+
+/*
+ * @content is assumed to be a NUL-terminated string that lives
+ * longer than the symlink itself.
+ */
+static void nfsd_symlink(struct dentry *parent, const char *name,
+ const char *content)
+{
+ struct inode *dir = parent->d_inode;
+ struct dentry *dentry;
+ int ret;
+
+ inode_lock(dir);
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ goto out;
+ ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content);
+ if (ret)
+ dput(dentry);
+out:
+ inode_unlock(dir);
+}
+#else
+static inline void nfsd_symlink(struct dentry *parent, const char *name,
+ const char *content)
+{
+}
+
+#endif
+
static void clear_ncl(struct inode *inode)
{
struct nfsdfs_client *ncl = inode->i_private;
@@ -1355,10 +1394,6 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
-#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
- [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
- &supported_enctypes_fops, S_IRUGO},
-#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
@@ -1371,6 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
if (ret)
return ret;
+ nfsd_symlink(sb->s_root, "supported_krb5_enctypes",
+ "/proc/net/rpc/gss_krb5_enctypes");
dentry = nfsd_mkdir(sb->s_root, NULL, "clients");
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -1458,16 +1495,11 @@ static __net_init int nfsd_init_net(struct net *net)
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
nfsd4_init_leases_net(nn);
- retval = nfsd_reply_cache_init(nn);
- if (retval)
- goto out_cache_error;
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
return 0;
-out_cache_error:
- nfsd_idmap_shutdown(net);
out_idmap_error:
nfsd_export_shutdown(net);
out_export_error:
@@ -1476,9 +1508,6 @@ out_export_error:
static __net_exit void nfsd_exit_net(struct net *net)
{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- nfsd_reply_cache_shutdown(nn);
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index fa0144a74267..d88498f8b275 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -86,7 +86,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
* Function prototypes.
*/
int nfsd_svc(int nrservs, struct net *net, const struct cred *cred);
-int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
+int nfsd_dispatch(struct svc_rqst *rqstp);
int nfsd_nrthreads(struct net *);
int nfsd_nrpools(struct net *);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a82d91afdc9c..c37195572fd0 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -838,11 +838,11 @@ static const struct svc_procedure nfsd_procedures2[18] = {
},
};
-
-static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)];
+static DEFINE_PER_CPU_ALIGNED(unsigned long,
+ nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]);
const struct svc_version nfsd_version2 = {
.vs_vers = 2,
- .vs_nproc = 18,
+ .vs_nproc = ARRAY_SIZE(nfsd_procedures2),
.vs_proc = nfsd_procedures2,
.vs_count = nfsd_count2,
.vs_dispatch = nfsd_dispatch,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 325d3d3f1211..9c7b1ef5be40 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -363,7 +363,7 @@ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn)
do {
read_seqbegin_or_lock(&nn->writeverf_lock, &seq);
- memcpy(verf, nn->writeverf, sizeof(*verf));
+ memcpy(verf, nn->writeverf, sizeof(nn->writeverf));
} while (need_seqretry(&nn->writeverf_lock, seq));
done_seqretry(&nn->writeverf_lock, seq);
}
@@ -427,16 +427,23 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
ret = nfsd_file_cache_start_net(net);
if (ret)
goto out_lockd;
- ret = nfs4_state_start_net(net);
+
+ ret = nfsd_reply_cache_init(nn);
if (ret)
goto out_filecache;
+ ret = nfs4_state_start_net(net);
+ if (ret)
+ goto out_reply_cache;
+
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
nfsd4_ssc_init_umount_work(nn);
#endif
nn->nfsd_net_up = true;
return 0;
+out_reply_cache:
+ nfsd_reply_cache_shutdown(nn);
out_filecache:
nfsd_file_cache_shutdown_net(net);
out_lockd:
@@ -454,6 +461,7 @@ static void nfsd_shutdown_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfs4_state_shutdown_net(net);
+ nfsd_reply_cache_shutdown(nn);
nfsd_file_cache_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
@@ -1022,7 +1030,6 @@ out:
/**
* nfsd_dispatch - Process an NFS or NFSACL Request
* @rqstp: incoming request
- * @statp: pointer to location of accept_stat field in RPC Reply buffer
*
* This RPC dispatcher integrates the NFS server's duplicate reply cache.
*
@@ -1030,9 +1037,10 @@ out:
* %0: Processing complete; do not send a Reply
* %1: Processing complete; send Reply in rqstp->rq_res
*/
-int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+int nfsd_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
+ __be32 *statp = rqstp->rq_accept_statp;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -1040,7 +1048,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
*/
rqstp->rq_cachetype = proc->pc_cachetype;
- svcxdr_init_decode(rqstp);
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
@@ -1053,12 +1060,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
goto out_dropit;
}
- /*
- * Need to grab the location to store the status, as
- * NFSv4 does some encoding while processing
- */
- svcxdr_init_encode(rqstp);
-
*statp = proc->pc_func(rqstp);
if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e94634d30591..d49d3060ed4f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -705,8 +705,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name
extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
void put_nfs4_file(struct nfs4_file *fi);
-extern struct nfsd4_copy *
-find_async_copy(struct nfs4_client *clp, stateid_t *staetid);
extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
struct nfs4_cpntf_state *cps);
extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 8f9c82d9e075..4183819ea082 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close,
)
);
-TRACE_EVENT(nfsd_file_fsync,
- TP_PROTO(
- const struct nfsd_file *nf,
- int ret
- ),
- TP_ARGS(nf, ret),
- TP_STRUCT__entry(
- __field(void *, nf_inode)
- __field(int, nf_ref)
- __field(int, ret)
- __field(unsigned long, nf_flags)
- __field(unsigned char, nf_may)
- __field(struct file *, nf_file)
- ),
- TP_fast_assign(
- __entry->nf_inode = nf->nf_inode;
- __entry->nf_ref = refcount_read(&nf->nf_ref);
- __entry->ret = ret;
- __entry->nf_flags = nf->nf_flags;
- __entry->nf_may = nf->nf_may;
- __entry->nf_file = nf->nf_file;
- ),
- TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d",
- __entry->nf_inode,
- __entry->nf_ref,
- show_nf_flags(__entry->nf_flags),
- show_nfsd_may_flags(__entry->nf_may),
- __entry->nf_file, __entry->ret
- )
-);
-
#include "cache.h"
TRACE_DEFINE_ENUM(RC_DROPIT);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ab4ee3509ce3..e7462b5e5f1e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -126,9 +126,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct dentry *dentry = *dpp;
struct path path = {.mnt = mntget(exp->ex_path.mnt),
.dentry = dget(dentry)};
+ unsigned int follow_flags = 0;
int err = 0;
- err = follow_down(&path);
+ if (exp->ex_flags & NFSEXP_CROSSMOUNT)
+ follow_flags = LOOKUP_AUTOMOUNT;
+
+ err = follow_down(&path, follow_flags);
if (err < 0)
goto out;
if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
@@ -223,7 +227,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
return 1;
if (nfsd4_is_junction(dentry))
return 1;
- if (d_mountpoint(dentry))
+ if (d_managed(dentry))
/*
* Might only be a mountpoint in a different namespace,
* but we need to check.
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 4fd2cf6d1d2d..510978e602da 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -571,7 +571,7 @@ struct nfsd4_copy {
struct task_struct *copy_task;
refcount_t refcount;
- struct vfsmount *ss_mnt;
+ struct nfsd4_ssc_umount_item *ss_nsui;
struct nfs_fh c_fh;
nfs4_stateid stateid;
};
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a07b24d170f2..aecbd712a00c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -46,6 +46,7 @@
#include <linux/net.h>
#include <linux/export.h>
#include <net/tcp.h>
+#include <trace/events/sock.h>
#include <linux/uaccess.h>
@@ -585,6 +586,8 @@ static void o2net_data_ready(struct sock *sk)
void (*ready)(struct sock *sk);
struct o2net_sock_container *sc;
+ trace_sk_data_ready(sk);
+
read_lock_bh(&sk->sk_callback_lock);
sc = sk->sk_user_data;
if (sc) {
@@ -1931,6 +1934,8 @@ static void o2net_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
+ trace_sk_data_ready(sk);
+
read_lock_bh(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (ready == NULL) { /* check for teardown race */
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
index 7d605db3bb3b..ace133cf6072 100644
--- a/fs/smbfs_common/smb2pdu.h
+++ b/fs/smbfs_common/smb2pdu.h
@@ -167,7 +167,7 @@ struct smb2_err_rsp {
__u8 ErrorContextCount;
__u8 Reserved;
__le32 ByteCount; /* even if zero, at least one byte follows */
- __u8 ErrorData[1]; /* variable length */
+ __u8 ErrorData[]; /* variable length */
} __packed;
#define SMB3_AES_CCM_NONCE 11
@@ -308,7 +308,7 @@ struct smb2_tree_connect_req {
__le16 Flags; /* Flags in SMB3.1.1 */
__le16 PathOffset;
__le16 PathLength;
- __u8 Buffer[1]; /* variable length */
+ __u8 Buffer[]; /* variable length */
} __packed;
/* Possible ShareType values */
@@ -595,7 +595,7 @@ struct smb2_negotiate_rsp {
__le16 SecurityBufferOffset;
__le16 SecurityBufferLength;
__le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
- __u8 Buffer[1]; /* variable length GSS security buffer */
+ __u8 Buffer[]; /* variable length GSS security buffer */
} __packed;
@@ -616,7 +616,7 @@ struct smb2_sess_setup_req {
__le16 SecurityBufferOffset;
__le16 SecurityBufferLength;
__le64 PreviousSessionId;
- __u8 Buffer[1]; /* variable length GSS security buffer */
+ __u8 Buffer[]; /* variable length GSS security buffer */
} __packed;
/* Currently defined SessionFlags */
@@ -633,7 +633,7 @@ struct smb2_sess_setup_rsp {
__le16 SessionFlags;
__le16 SecurityBufferOffset;
__le16 SecurityBufferLength;
- __u8 Buffer[1]; /* variable length GSS security buffer */
+ __u8 Buffer[]; /* variable length GSS security buffer */
} __packed;
@@ -715,7 +715,7 @@ struct smb2_read_req {
__le32 RemainingBytes;
__le16 ReadChannelInfoOffset;
__le16 ReadChannelInfoLength;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
/* Read flags */
@@ -730,7 +730,7 @@ struct smb2_read_rsp {
__le32 DataLength;
__le32 DataRemaining;
__le32 Flags;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
@@ -754,7 +754,7 @@ struct smb2_write_req {
__le16 WriteChannelInfoOffset;
__le16 WriteChannelInfoLength;
__le32 Flags;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
struct smb2_write_rsp {
@@ -765,7 +765,7 @@ struct smb2_write_rsp {
__le32 DataLength;
__le32 DataRemaining;
__u32 Reserved2;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
@@ -812,7 +812,10 @@ struct smb2_lock_req {
__u64 PersistentFileId;
__u64 VolatileFileId;
/* Followed by at least one */
- struct smb2_lock_element locks[1];
+ union {
+ struct smb2_lock_element lock;
+ DECLARE_FLEX_ARRAY(struct smb2_lock_element, locks);
+ };
} __packed;
struct smb2_lock_rsp {
@@ -866,7 +869,7 @@ struct smb2_query_directory_req {
__le16 FileNameOffset;
__le16 FileNameLength;
__le32 OutputBufferLength;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
struct smb2_query_directory_rsp {
@@ -874,7 +877,7 @@ struct smb2_query_directory_rsp {
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
/*
@@ -897,7 +900,7 @@ struct smb2_set_info_req {
__le32 AdditionalInformation;
__u64 PersistentFileId;
__u64 VolatileFileId;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
struct smb2_set_info_rsp {
@@ -952,7 +955,7 @@ struct smb2_change_notify_rsp {
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
- __u8 Buffer[1]; /* array of file notify structs */
+ __u8 Buffer[]; /* array of file notify structs */
} __packed;
@@ -1158,7 +1161,7 @@ struct smb2_create_rsp {
__u64 VolatileFileId;
__le32 CreateContextsOffset;
__le32 CreateContextsLength;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
struct create_posix {
@@ -1501,7 +1504,7 @@ struct smb2_query_info_req {
__le32 Flags;
__u64 PersistentFileId;
__u64 VolatileFileId;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
struct smb2_query_info_rsp {
@@ -1509,7 +1512,7 @@ struct smb2_query_info_rsp {
__le16 StructureSize; /* Must be 9 */
__le16 OutputBufferOffset;
__le32 OutputBufferLength;
- __u8 Buffer[1];
+ __u8 Buffer[];
} __packed;
/*
@@ -1570,7 +1573,10 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */
__le32 Mode;
__le32 AlignmentRequirement;
__le32 FileNameLength;
- char FileName[1];
+ union {
+ char __pad; /* Legacy structure padding */
+ DECLARE_FLEX_ARRAY(char, FileName);
+ };
} __packed; /* level 18 Query */
struct smb2_file_eof_info { /* encoding of request for level 10 */
diff --git a/fs/splice.c b/fs/splice.c
index 87d9b19349de..2e76dbb81a8f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -282,6 +282,99 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
kfree(spd->partial);
}
+/*
+ * Splice data from an O_DIRECT file into pages and then add them to the output
+ * pipe.
+ */
+ssize_t direct_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ struct iov_iter to;
+ struct bio_vec *bv;
+ struct kiocb kiocb;
+ struct page **pages;
+ ssize_t ret;
+ size_t used, npages, chunk, remain, reclaim;
+ int i;
+
+ /* Work out how much data we can actually add into the pipe */
+ used = pipe_occupancy(pipe->head, pipe->tail);
+ npages = max_t(ssize_t, pipe->max_usage - used, 0);
+ len = min_t(size_t, len, npages * PAGE_SIZE);
+ npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+ bv = kzalloc(array_size(npages, sizeof(bv[0])) +
+ array_size(npages, sizeof(struct page *)), GFP_KERNEL);
+ if (!bv)
+ return -ENOMEM;
+
+ pages = (void *)(bv + npages);
+ npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
+ if (!npages) {
+ kfree(bv);
+ return -ENOMEM;
+ }
+
+ remain = len = min_t(size_t, len, npages * PAGE_SIZE);
+
+ for (i = 0; i < npages; i++) {
+ chunk = min_t(size_t, PAGE_SIZE, remain);
+ bv[i].bv_page = pages[i];
+ bv[i].bv_offset = 0;
+ bv[i].bv_len = chunk;
+ remain -= chunk;
+ }
+
+ /* Do the I/O */
+ iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
+ init_sync_kiocb(&kiocb, in);
+ kiocb.ki_pos = *ppos;
+ ret = call_read_iter(in, &kiocb, &to);
+
+ reclaim = npages * PAGE_SIZE;
+ remain = 0;
+ if (ret > 0) {
+ reclaim -= ret;
+ remain = ret;
+ *ppos = kiocb.ki_pos;
+ file_accessed(in);
+ } else if (ret < 0) {
+ /*
+ * callers of ->splice_read() expect -EAGAIN on
+ * "can't put anything in there", rather than -EFAULT.
+ */
+ if (ret == -EFAULT)
+ ret = -EAGAIN;
+ }
+
+ /* Free any pages that didn't get touched at all. */
+ reclaim /= PAGE_SIZE;
+ if (reclaim) {
+ npages -= reclaim;
+ release_pages(pages + npages, reclaim);
+ }
+
+ /* Push the remaining pages into the pipe. */
+ for (i = 0; i < npages; i++) {
+ struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+ chunk = min_t(size_t, remain, PAGE_SIZE);
+ *buf = (struct pipe_buffer) {
+ .ops = &default_pipe_buf_ops,
+ .page = bv[i].bv_page,
+ .offset = 0,
+ .len = chunk,
+ };
+ pipe->head++;
+ remain -= chunk;
+ }
+
+ kfree(bv);
+ return ret;
+}
+EXPORT_SYMBOL(direct_splice_read);
+
/**
* generic_file_splice_read - splice data from file to a pipe
* @in: file to splice from
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 989cf341779b..f8ff81c3de76 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2472,20 +2472,20 @@ xfs_defer_agfl_block(
struct xfs_owner_info *oinfo)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent_free_item *new; /* new element */
+ struct xfs_extent_free_item *xefi;
ASSERT(xfs_extfree_item_cache != NULL);
ASSERT(oinfo != NULL);
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
- new->xefi_blockcount = 1;
- new->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+ xefi->xefi_blockcount = 1;
+ xefi->xefi_owner = oinfo->oi_owner;
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
}
/*
@@ -2500,7 +2500,7 @@ __xfs_free_extent_later(
const struct xfs_owner_info *oinfo,
bool skip_discard)
{
- struct xfs_extent_free_item *new; /* new element */
+ struct xfs_extent_free_item *xefi;
#ifdef DEBUG
struct xfs_mount *mp = tp->t_mountp;
xfs_agnumber_t agno;
@@ -2519,27 +2519,27 @@ __xfs_free_extent_later(
#endif
ASSERT(xfs_extfree_item_cache != NULL);
- new = kmem_cache_zalloc(xfs_extfree_item_cache,
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
GFP_KERNEL | __GFP_NOFAIL);
- new->xefi_startblock = bno;
- new->xefi_blockcount = (xfs_extlen_t)len;
+ xefi->xefi_startblock = bno;
+ xefi->xefi_blockcount = (xfs_extlen_t)len;
if (skip_discard)
- new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
+ xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
if (oinfo) {
ASSERT(oinfo->oi_offset == 0);
if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
- new->xefi_flags |= XFS_EFI_ATTR_FORK;
+ xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
- new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
- new->xefi_owner = oinfo->oi_owner;
+ xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
+ xefi->xefi_owner = oinfo->oi_owner;
} else {
- new->xefi_owner = XFS_RMAP_OWN_NULL;
+ xefi->xefi_owner = XFS_RMAP_OWN_NULL;
}
trace_xfs_bmap_free_defer(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
}
#ifdef DEBUG
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 0d56a8d862e8..c8c65387136c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent(
int
xfs_bmap_finish_one(
struct xfs_trans *tp,
- struct xfs_inode *ip,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
+ struct xfs_bmap_intent *bi)
{
+ struct xfs_bmbt_irec *bmap = &bi->bi_bmap;
int error = 0;
ASSERT(tp->t_firstblock == NULLFSBLOCK);
trace_xfs_bmap_deferred(tp->t_mountp,
- XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
- XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- ip->i_ino, whichfork, startoff, *blockcount, state);
+ XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
+ bi->bi_type,
+ XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
+ bi->bi_owner->i_ino, bi->bi_whichfork,
+ bmap->br_startoff, bmap->br_blockcount,
+ bmap->br_state);
- if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
+ if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
if (XFS_TEST_ERROR(false, tp->t_mountp,
XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
- switch (type) {
+ switch (bi->bi_type) {
case XFS_BMAP_MAP:
- error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
- startblock, 0);
- *blockcount = 0;
+ error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
+ bmap->br_blockcount, bmap->br_startblock, 0);
+ bmap->br_blockcount = 0;
break;
case XFS_BMAP_UNMAP:
- error = __xfs_bunmapi(tp, ip, startoff, blockcount,
- XFS_BMAPI_REMAP, 1);
+ error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
+ &bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 16db95b11589..01c2df35c3e3 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -234,10 +234,7 @@ struct xfs_bmap_intent {
struct xfs_bmbt_irec bi_bmap;
};
-int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip,
- enum xfs_bmap_intent_type type, int whichfork,
- xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount, xfs_exntst_t state);
+int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 35f574421670..da8c769887fd 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2913,9 +2913,22 @@ xfs_btree_split_worker(
}
/*
- * BMBT split requests often come in with little stack to work on. Push
+ * BMBT split requests often come in with little stack to work on so we push
* them off to a worker thread so there is lots of stack to use. For the other
* btree types, just call directly to avoid the context switch overhead here.
+ *
+ * Care must be taken here - the work queue rescuer thread introduces potential
+ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new
+ * AGFs to allocate blocks. A task being run by the rescuer could attempt to
+ * lock an AGF that is already locked by a task queued to run by the rescuer,
+ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to
+ * release it until the current thread it is running gains the lock.
+ *
+ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF
+ * already locked to allocate from. The only place that doesn't hold an AGF
+ * locked is unwritten extent conversion at IO completion, but that has already
+ * been offloaded to a worker thread and hence has no stack consumption issues
+ * we have to worry about.
*/
STATIC int /* error */
xfs_btree_split(
@@ -2929,7 +2942,8 @@ xfs_btree_split(
struct xfs_btree_split_args args;
DECLARE_COMPLETION_ONSTACK(done);
- if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ if (cur->bc_btnum != XFS_BTNUM_BMAP ||
+ cur->bc_tp->t_firstblock == NULLFSBLOCK)
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
args.cur = cur;
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6f7ed9288fe4..bcf46aa0d08b 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1213,37 +1213,33 @@ out_error:
STATIC int
xfs_refcount_adjust(
struct xfs_btree_cur *cur,
- xfs_agblock_t agbno,
- xfs_extlen_t aglen,
- xfs_agblock_t *new_agbno,
- xfs_extlen_t *new_aglen,
+ xfs_agblock_t *agbno,
+ xfs_extlen_t *aglen,
enum xfs_refc_adjust_op adj)
{
bool shape_changed;
int shape_changes = 0;
int error;
- *new_agbno = agbno;
- *new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_increase(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *agbno, *aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- agbno, aglen);
+ trace_xfs_refcount_decrease(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, *agbno, *aglen);
/*
* Ensure that no rcextents cross the boundary of the adjustment range.
*/
error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
- agbno, &shape_changed);
+ *agbno, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
shape_changes++;
error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED,
- agbno + aglen, &shape_changed);
+ *agbno + *aglen, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1253,7 +1249,7 @@ xfs_refcount_adjust(
* Try to merge with the left or right extents of the range.
*/
error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED,
- new_agbno, new_aglen, adj, &shape_changed);
+ agbno, aglen, adj, &shape_changed);
if (error)
goto out_error;
if (shape_changed)
@@ -1262,7 +1258,7 @@ xfs_refcount_adjust(
cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
- error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj);
+ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
if (error)
goto out_error;
@@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup(
static inline int
xfs_refcount_continue_op(
struct xfs_btree_cur *cur,
- xfs_fsblock_t startblock,
- xfs_agblock_t new_agbno,
- xfs_extlen_t new_len,
- xfs_fsblock_t *new_fsbno)
+ struct xfs_refcount_intent *ri,
+ xfs_agblock_t new_agbno)
{
struct xfs_mount *mp = cur->bc_mp;
struct xfs_perag *pag = cur->bc_ag.pag;
- if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len)))
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
+ ri->ri_blockcount)))
return -EFSCORRUPTED;
- *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
+ ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
- ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len));
- ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno));
+ ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount));
+ ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
return 0;
}
@@ -1327,11 +1322,7 @@ xfs_refcount_continue_op(
int
xfs_refcount_finish_one(
struct xfs_trans *tp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
+ struct xfs_refcount_intent *ri,
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -1339,17 +1330,16 @@ xfs_refcount_finish_one(
struct xfs_buf *agbp = NULL;
int error = 0;
xfs_agblock_t bno;
- xfs_agblock_t new_agbno;
unsigned long nr_ops = 0;
int shape_changes = 0;
struct xfs_perag *pag;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
- bno = XFS_FSB_TO_AGBNO(mp, startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock);
- trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
- type, XFS_FSB_TO_AGBNO(mp, startblock),
- blockcount);
+ trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock),
+ ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock),
+ ri->ri_blockcount);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) {
error = -EIO;
@@ -1380,42 +1370,42 @@ xfs_refcount_finish_one(
}
*pcur = rcur;
- switch (type) {
+ switch (ri->ri_type) {
case XFS_REFCOUNT_INCREASE:
- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_INCREASE);
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_INCREASE);
if (error)
goto out_drop;
- if (*new_len > 0)
- error = xfs_refcount_continue_op(rcur, startblock,
- new_agbno, *new_len, new_fsb);
+ if (ri->ri_blockcount > 0)
+ error = xfs_refcount_continue_op(rcur, ri, bno);
break;
case XFS_REFCOUNT_DECREASE:
- error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
- new_len, XFS_REFCOUNT_ADJUST_DECREASE);
+ error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount,
+ XFS_REFCOUNT_ADJUST_DECREASE);
if (error)
goto out_drop;
- if (*new_len > 0)
- error = xfs_refcount_continue_op(rcur, startblock,
- new_agbno, *new_len, new_fsb);
+ if (ri->ri_blockcount > 0)
+ error = xfs_refcount_continue_op(rcur, ri, bno);
break;
case XFS_REFCOUNT_ALLOC_COW:
- *new_fsb = startblock + blockcount;
- *new_len = 0;
- error = __xfs_refcount_cow_alloc(rcur, bno, blockcount);
+ error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount);
+ if (error)
+ goto out_drop;
+ ri->ri_blockcount = 0;
break;
case XFS_REFCOUNT_FREE_COW:
- *new_fsb = startblock + blockcount;
- *new_len = 0;
- error = __xfs_refcount_cow_free(rcur, bno, blockcount);
+ error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount);
+ if (error)
+ goto out_drop;
+ ri->ri_blockcount = 0;
break;
default:
ASSERT(0);
error = -EFSCORRUPTED;
}
- if (!error && *new_len > 0)
- trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type,
- bno, blockcount, new_agbno, *new_len);
+ if (!error && ri->ri_blockcount > 0)
+ trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno,
+ ri->ri_type, bno, ri->ri_blockcount);
out_drop:
xfs_perag_put(pag);
return error;
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 452f30556f5a..c633477ce3ce 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp,
extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
extern int xfs_refcount_finish_one(struct xfs_trans *tp,
- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+ struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur);
extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index b56aca1e7c66..df720041cd3d 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup(
int
xfs_rmap_finish_one(
struct xfs_trans *tp,
- enum xfs_rmap_intent_type type,
- uint64_t owner,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state,
+ struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur)
{
struct xfs_mount *mp = tp->t_mountp;
@@ -2408,11 +2402,13 @@ xfs_rmap_finish_one(
xfs_agblock_t bno;
bool unwritten;
- pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock));
- bno = XFS_FSB_TO_AGBNO(mp, startblock);
+ pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock));
+ bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock);
- trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork,
- startoff, blockcount, state);
+ trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno,
+ ri->ri_owner, ri->ri_whichfork,
+ ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount,
+ ri->ri_bmap.br_state);
if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) {
error = -EIO;
@@ -2448,35 +2444,37 @@ xfs_rmap_finish_one(
}
*pcur = rcur;
- xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff);
- unwritten = state == XFS_EXT_UNWRITTEN;
- bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock);
+ xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork,
+ ri->ri_bmap.br_startoff);
+ unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
+ bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
- switch (type) {
+ switch (ri->ri_type) {
case XFS_RMAP_ALLOC:
case XFS_RMAP_MAP:
- error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
+ error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount,
+ unwritten, &oinfo);
break;
case XFS_RMAP_MAP_SHARED:
- error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
- &oinfo);
+ error = xfs_rmap_map_shared(rcur, bno,
+ ri->ri_bmap.br_blockcount, unwritten, &oinfo);
break;
case XFS_RMAP_FREE:
case XFS_RMAP_UNMAP:
- error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
- &oinfo);
+ error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount,
+ unwritten, &oinfo);
break;
case XFS_RMAP_UNMAP_SHARED:
- error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
- &oinfo);
+ error = xfs_rmap_unmap_shared(rcur, bno,
+ ri->ri_bmap.br_blockcount, unwritten, &oinfo);
break;
case XFS_RMAP_CONVERT:
- error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
- &oinfo);
+ error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount,
+ !unwritten, &oinfo);
break;
case XFS_RMAP_CONVERT_SHARED:
- error = xfs_rmap_convert_shared(rcur, bno, blockcount,
- !unwritten, &oinfo);
+ error = xfs_rmap_convert_shared(rcur, bno,
+ ri->ri_bmap.br_blockcount, !unwritten, &oinfo);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 54741a591a17..2dac88cea28d 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
struct xfs_btree_cur *rcur, int error);
-int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
- uint64_t owner, int whichfork, xfs_fileoff_t startoff,
- xfs_fsblock_t startblock, xfs_filblks_t blockcount,
- xfs_exntst_t state, struct xfs_btree_cur **pcur);
+int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
+ struct xfs_btree_cur **pcur);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 41323da523d1..6e2f0013380a 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -246,18 +246,11 @@ static int
xfs_trans_log_finish_bmap_update(
struct xfs_trans *tp,
struct xfs_bud_log_item *budp,
- enum xfs_bmap_intent_type type,
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
+ struct xfs_bmap_intent *bi)
{
int error;
- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
- startblock, blockcount, state);
+ error = xfs_bmap_finish_one(tp, bi);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -290,24 +283,24 @@ xfs_bmap_update_diff_items(
/* Set the map extent flags for this mapping. */
static void
xfs_trans_set_bmap_flags(
- struct xfs_map_extent *bmap,
+ struct xfs_map_extent *map,
enum xfs_bmap_intent_type type,
int whichfork,
xfs_exntst_t state)
{
- bmap->me_flags = 0;
+ map->me_flags = 0;
switch (type) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- bmap->me_flags = type;
+ map->me_flags = type;
break;
default:
ASSERT(0);
}
if (state == XFS_EXT_UNWRITTEN)
- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
if (whichfork == XFS_ATTR_FORK)
- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+ map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
}
/* Log bmap updates in the intent item. */
@@ -315,7 +308,7 @@ STATIC void
xfs_bmap_update_log_item(
struct xfs_trans *tp,
struct xfs_bui_log_item *buip,
- struct xfs_bmap_intent *bmap)
+ struct xfs_bmap_intent *bi)
{
uint next_extent;
struct xfs_map_extent *map;
@@ -331,12 +324,12 @@ xfs_bmap_update_log_item(
next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
ASSERT(next_extent < buip->bui_format.bui_nextents);
map = &buip->bui_format.bui_extents[next_extent];
- map->me_owner = bmap->bi_owner->i_ino;
- map->me_startblock = bmap->bi_bmap.br_startblock;
- map->me_startoff = bmap->bi_bmap.br_startoff;
- map->me_len = bmap->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
- bmap->bi_bmap.br_state);
+ map->me_owner = bi->bi_owner->i_ino;
+ map->me_startblock = bi->bi_bmap.br_startblock;
+ map->me_startoff = bi->bi_bmap.br_startoff;
+ map->me_len = bi->bi_bmap.br_blockcount;
+ xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork,
+ bi->bi_bmap.br_state);
}
static struct xfs_log_item *
@@ -348,15 +341,15 @@ xfs_bmap_update_create_intent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_bui_log_item *buip = xfs_bui_init(mp);
- struct xfs_bmap_intent *bmap;
+ struct xfs_bmap_intent *bi;
ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
xfs_trans_add_item(tp, &buip->bui_item);
if (sort)
list_sort(mp, items, xfs_bmap_update_diff_items);
- list_for_each_entry(bmap, items, bi_list)
- xfs_bmap_update_log_item(tp, buip, bmap);
+ list_for_each_entry(bi, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bi);
return &buip->bui_item;
}
@@ -378,25 +371,17 @@ xfs_bmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_bmap_intent *bmap;
- xfs_filblks_t count;
+ struct xfs_bmap_intent *bi;
int error;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done),
- bmap->bi_type,
- bmap->bi_owner, bmap->bi_whichfork,
- bmap->bi_bmap.br_startoff,
- bmap->bi_bmap.br_startblock,
- &count,
- bmap->bi_bmap.br_state);
- if (!error && count > 0) {
- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
- bmap->bi_bmap.br_blockcount = count;
+ bi = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
+ if (!error && bi->bi_bmap.br_blockcount > 0) {
+ ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
}
- kmem_cache_free(xfs_bmap_intent_cache, bmap);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
return error;
}
@@ -413,10 +398,10 @@ STATIC void
xfs_bmap_update_cancel_item(
struct list_head *item)
{
- struct xfs_bmap_intent *bmap;
+ struct xfs_bmap_intent *bi;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_cache_free(xfs_bmap_intent_cache, bmap);
+ bi = container_of(item, struct xfs_bmap_intent, bi_list);
+ kmem_cache_free(xfs_bmap_intent_cache, bi);
}
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
@@ -434,18 +419,18 @@ xfs_bui_validate(
struct xfs_mount *mp,
struct xfs_bui_log_item *buip)
{
- struct xfs_map_extent *bmap;
+ struct xfs_map_extent *map;
/* Only one mapping operation per BUI... */
if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
return false;
- bmap = &buip->bui_format.bui_extents[0];
+ map = &buip->bui_format.bui_extents[0];
- if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)
+ if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS)
return false;
- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
break;
@@ -453,13 +438,13 @@ xfs_bui_validate(
return false;
}
- if (!xfs_verify_ino(mp, bmap->me_owner))
+ if (!xfs_verify_ino(mp, map->me_owner))
return false;
- if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len))
+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
- return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len);
+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
/*
@@ -471,17 +456,13 @@ xfs_bui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
- struct xfs_bmbt_irec irec;
+ struct xfs_bmap_intent fake = { };
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_map_extent *bmap;
+ struct xfs_map_extent *map;
struct xfs_bud_log_item *budp;
- xfs_filblks_t count;
- xfs_exntst_t state;
- unsigned int bui_type;
- int whichfork;
int iext_delta;
int error = 0;
@@ -491,14 +472,12 @@ xfs_bui_item_recover(
return -EFSCORRUPTED;
}
- bmap = &buip->bui_format.bui_extents[0];
- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ map = &buip->bui_format.bui_extents[0];
+ fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- error = xlog_recover_iget(mp, bmap->me_owner, &ip);
+ error = xlog_recover_iget(mp, map->me_owner, &ip);
if (error)
return error;
@@ -512,34 +491,34 @@ xfs_bui_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- if (bui_type == XFS_BMAP_MAP)
+ if (fake.bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
- count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
- whichfork, bmap->me_startoff, bmap->me_startblock,
- &count, state);
+ fake.bi_owner = ip;
+ fake.bi_bmap.br_startblock = map->me_startblock;
+ fake.bi_bmap.br_startoff = map->me_startoff;
+ fake.bi_bmap.br_blockcount = map->me_len;
+ fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+
+ error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap,
- sizeof(*bmap));
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
+ sizeof(*map));
if (error)
goto err_cancel;
- if (count > 0) {
- ASSERT(bui_type == XFS_BMAP_UNMAP);
- irec.br_startblock = bmap->me_startblock;
- irec.br_blockcount = count;
- irec.br_startoff = bmap->me_startoff;
- irec.br_state = state;
- xfs_bmap_unmap_extent(tp, ip, &irec);
+ if (fake.bi_bmap.br_blockcount > 0) {
+ ASSERT(fake.bi_type == XFS_BMAP_UNMAP);
+ xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap);
}
/*
@@ -579,18 +558,18 @@ xfs_bui_item_relog(
{
struct xfs_bud_log_item *budp;
struct xfs_bui_log_item *buip;
- struct xfs_map_extent *extp;
+ struct xfs_map_extent *map;
unsigned int count;
count = BUI_ITEM(intent)->bui_format.bui_nextents;
- extp = BUI_ITEM(intent)->bui_format.bui_extents;
+ map = BUI_ITEM(intent)->bui_format.bui_extents;
tp->t_flags |= XFS_TRANS_DIRTY;
budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
buip = xfs_bui_init(tp->t_mountp);
- memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map));
atomic_set(&buip->bui_next_extent, count);
xfs_trans_add_item(tp, &buip->bui_item);
set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ae082808cfed..b2cbbba3e15a 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_errortag);
-static struct kobj_type xfs_errortag_ktype = {
+static const struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_errortag_sysfs_ops,
.default_groups = xfs_errortag_groups,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index dbe6c37dc697..0b9c5ba8a598 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
/*
* XFS panic tags -- allow a call to xfs_alert_tag() be turned into
- * a panic by setting xfs_panic_mask in a sysctl.
+ * a panic by setting fs.xfs.panic_mask in a sysctl.
*/
#define XFS_NO_PTAG 0u
#define XFS_PTAG_IFLUSH (1u << 0)
@@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
+#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \
+ XFS_PTAG_LOGRES | \
+ XFS_PTAG_AILDELETE | \
+ XFS_PTAG_ERROR_REPORT | \
+ XFS_PTAG_SHUTDOWN_CORRUPT | \
+ XFS_PTAG_SHUTDOWN_IOERROR | \
+ XFS_PTAG_SHUTDOWN_LOGERROR | \
+ XFS_PTAG_FSBLOCK_ZERO | \
+ XFS_PTAG_VERIFIER_ERROR)
+
#define XFS_PTAG_STRINGS \
{ XFS_NO_PTAG, "none" }, \
{ XFS_PTAG_IFLUSH, "iflush" }, \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d5130d1fcfae..011b50469301 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -345,23 +345,30 @@ static int
xfs_trans_free_extent(
struct xfs_trans *tp,
struct xfs_efd_log_item *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
+ struct xfs_extent_free_item *xefi)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_extent *extp;
uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp,
+ xefi->xefi_startblock);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- start_block);
+ xefi->xefi_startblock);
int error;
- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+ oinfo.oi_owner = xefi->xefi_owner;
+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno,
+ xefi->xefi_blockcount);
- error = __xfs_free_extent(tp, start_block, ext_len,
- oinfo, XFS_AG_RESV_NONE, skip_discard);
+ error = __xfs_free_extent(tp, xefi->xefi_startblock,
+ xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -375,8 +382,8 @@ xfs_trans_free_extent(
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = start_block;
- extp->ext_len = ext_len;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
return error;
@@ -404,7 +411,7 @@ STATIC void
xfs_extent_free_log_item(
struct xfs_trans *tp,
struct xfs_efi_log_item *efip,
- struct xfs_extent_free_item *free)
+ struct xfs_extent_free_item *xefi)
{
uint next_extent;
struct xfs_extent *extp;
@@ -420,8 +427,8 @@ xfs_extent_free_log_item(
next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
ASSERT(next_extent < efip->efi_format.efi_nextents);
extp = &efip->efi_format.efi_extents[next_extent];
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
}
static struct xfs_log_item *
@@ -433,15 +440,15 @@ xfs_extent_free_create_intent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
ASSERT(count > 0);
xfs_trans_add_item(tp, &efip->efi_item);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
- list_for_each_entry(free, items, xefi_list)
- xfs_extent_free_log_item(tp, efip, free);
+ list_for_each_entry(xefi, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, xefi);
return &efip->efi_item;
}
@@ -463,21 +470,13 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_owner_info oinfo = { };
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
int error;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- oinfo.oi_owner = free->xefi_owner;
- if (free->xefi_flags & XFS_EFI_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (free->xefi_flags & XFS_EFI_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- error = xfs_trans_free_extent(tp, EFD_ITEM(done),
- free->xefi_startblock,
- free->xefi_blockcount,
- &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD);
- kmem_cache_free(xfs_extfree_item_cache, free);
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -494,10 +493,10 @@ STATIC void
xfs_extent_free_cancel_item(
struct list_head *item)
{
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_cache_free(xfs_extfree_item_cache, free);
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -523,7 +522,7 @@ xfs_agfl_free_finish_item(
struct xfs_owner_info oinfo = { };
struct xfs_mount *mp = tp->t_mountp;
struct xfs_efd_log_item *efdp = EFD_ITEM(done);
- struct xfs_extent_free_item *free;
+ struct xfs_extent_free_item *xefi;
struct xfs_extent *extp;
struct xfs_buf *agbp;
int error;
@@ -532,13 +531,13 @@ xfs_agfl_free_finish_item(
uint next_extent;
struct xfs_perag *pag;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- ASSERT(free->xefi_blockcount == 1);
- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
- oinfo.oi_owner = free->xefi_owner;
+ xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(xefi->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
+ oinfo.oi_owner = xefi->xefi_owner;
- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount);
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
@@ -559,11 +558,11 @@ xfs_agfl_free_finish_item(
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_cache_free(xfs_extfree_item_cache, free);
+ kmem_cache_free(xfs_extfree_item_cache, xefi);
return error;
}
@@ -599,7 +598,6 @@ xfs_efi_item_recover(
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
- struct xfs_extent *extp;
int i;
int error = 0;
@@ -624,10 +622,17 @@ xfs_efi_item_recover(
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+ struct xfs_extent_free_item fake = {
+ .xefi_owner = XFS_RMAP_OWN_UNKNOWN,
+ };
+ struct xfs_extent *extp;
+
extp = &efip->efi_format.efi_extents[i];
- error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len,
- &XFS_RMAP_OINFO_ANY_OWNER, false);
+
+ fake.xefi_startblock = extp->ext_start;
+ fake.xefi_blockcount = extp->ext_len;
+
+ error = xfs_trans_free_extent(tp, efdp, &fake);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 4d0a98f920ca..9edc1f2bc939 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_error.h"
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
@@ -15,7 +16,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 256 },
+ .panic_mask = { 0, 0, XFS_PTAG_MASK},
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fc1946f80a4a..69dbe7814128 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -83,7 +83,7 @@ xfs_iomap_valid(
return true;
}
-static const struct iomap_page_ops xfs_iomap_page_ops = {
+static const struct iomap_folio_ops xfs_iomap_folio_ops = {
.iomap_valid = xfs_iomap_valid,
};
@@ -133,7 +133,7 @@ xfs_bmbt_to_iomap(
iomap->flags |= IOMAP_F_DIRTY;
iomap->validity_cookie = sequence_cookie;
- iomap->page_ops = &xfs_iomap_page_ops;
+ iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 858e3e9eb4a8..48d771a76add 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -252,17 +252,12 @@ static int
xfs_trans_log_finish_refcount_update(
struct xfs_trans *tp,
struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
+ struct xfs_refcount_intent *ri,
struct xfs_btree_cur **pcur)
{
int error;
- error = xfs_refcount_finish_one(tp, type, startblock,
- blockcount, new_fsb, new_len, pcur);
+ error = xfs_refcount_finish_one(tp, ri, pcur);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -297,16 +292,16 @@ xfs_refcount_update_diff_items(
/* Set the phys extent flags for this reverse mapping. */
static void
xfs_trans_set_refcount_flags(
- struct xfs_phys_extent *refc,
+ struct xfs_phys_extent *pmap,
enum xfs_refcount_intent_type type)
{
- refc->pe_flags = 0;
+ pmap->pe_flags = 0;
switch (type) {
case XFS_REFCOUNT_INCREASE:
case XFS_REFCOUNT_DECREASE:
case XFS_REFCOUNT_ALLOC_COW:
case XFS_REFCOUNT_FREE_COW:
- refc->pe_flags |= type;
+ pmap->pe_flags |= type;
break;
default:
ASSERT(0);
@@ -318,10 +313,10 @@ STATIC void
xfs_refcount_update_log_item(
struct xfs_trans *tp,
struct xfs_cui_log_item *cuip,
- struct xfs_refcount_intent *refc)
+ struct xfs_refcount_intent *ri)
{
uint next_extent;
- struct xfs_phys_extent *ext;
+ struct xfs_phys_extent *pmap;
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
@@ -333,10 +328,10 @@ xfs_refcount_update_log_item(
*/
next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
ASSERT(next_extent < cuip->cui_format.cui_nextents);
- ext = &cuip->cui_format.cui_extents[next_extent];
- ext->pe_startblock = refc->ri_startblock;
- ext->pe_len = refc->ri_blockcount;
- xfs_trans_set_refcount_flags(ext, refc->ri_type);
+ pmap = &cuip->cui_format.cui_extents[next_extent];
+ pmap->pe_startblock = ri->ri_startblock;
+ pmap->pe_len = ri->ri_blockcount;
+ xfs_trans_set_refcount_flags(pmap, ri->ri_type);
}
static struct xfs_log_item *
@@ -348,15 +343,15 @@ xfs_refcount_update_create_intent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
- struct xfs_refcount_intent *refc;
+ struct xfs_refcount_intent *ri;
ASSERT(count > 0);
xfs_trans_add_item(tp, &cuip->cui_item);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
- list_for_each_entry(refc, items, ri_list)
- xfs_refcount_update_log_item(tp, cuip, refc);
+ list_for_each_entry(ri, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, ri);
return &cuip->cui_item;
}
@@ -378,25 +373,20 @@ xfs_refcount_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_refcount_intent *refc;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_aglen;
+ struct xfs_refcount_intent *ri;
int error;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done),
- refc->ri_type, refc->ri_startblock, refc->ri_blockcount,
- &new_fsb, &new_aglen, state);
+ ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
+ state);
/* Did we run out of reservation? Requeue what we didn't finish. */
- if (!error && new_aglen > 0) {
- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
- refc->ri_type == XFS_REFCOUNT_DECREASE);
- refc->ri_startblock = new_fsb;
- refc->ri_blockcount = new_aglen;
+ if (!error && ri->ri_blockcount > 0) {
+ ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
+ ri->ri_type == XFS_REFCOUNT_DECREASE);
return -EAGAIN;
}
- kmem_cache_free(xfs_refcount_intent_cache, refc);
+ kmem_cache_free(xfs_refcount_intent_cache, ri);
return error;
}
@@ -413,10 +403,10 @@ STATIC void
xfs_refcount_update_cancel_item(
struct list_head *item)
{
- struct xfs_refcount_intent *refc;
+ struct xfs_refcount_intent *ri;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_cache_free(xfs_refcount_intent_cache, refc);
+ ri = container_of(item, struct xfs_refcount_intent, ri_list);
+ kmem_cache_free(xfs_refcount_intent_cache, ri);
}
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
@@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
static inline bool
xfs_cui_validate_phys(
struct xfs_mount *mp,
- struct xfs_phys_extent *refc)
+ struct xfs_phys_extent *pmap)
{
if (!xfs_has_reflink(mp))
return false;
- if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)
+ if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)
return false;
- switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+ switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
case XFS_REFCOUNT_INCREASE:
case XFS_REFCOUNT_DECREASE:
case XFS_REFCOUNT_ALLOC_COW:
@@ -451,7 +441,7 @@ xfs_cui_validate_phys(
return false;
}
- return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len);
+ return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
/*
@@ -463,18 +453,13 @@ xfs_cui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
- struct xfs_bmbt_irec irec;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
- struct xfs_phys_extent *refc;
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_len;
unsigned int refc_type;
bool requeue_only = false;
- enum xfs_refcount_intent_type type;
int i;
int error = 0;
@@ -513,14 +498,17 @@ xfs_cui_item_recover(
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- refc = &cuip->cui_format.cui_extents[i];
- refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ struct xfs_refcount_intent fake = { };
+ struct xfs_phys_extent *pmap;
+
+ pmap = &cuip->cui_format.cui_extents[i];
+ refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
switch (refc_type) {
case XFS_REFCOUNT_INCREASE:
case XFS_REFCOUNT_DECREASE:
case XFS_REFCOUNT_ALLOC_COW:
case XFS_REFCOUNT_FREE_COW:
- type = refc_type;
+ fake.ri_type = refc_type;
break;
default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
@@ -529,13 +517,12 @@ xfs_cui_item_recover(
error = -EFSCORRUPTED;
goto abort_error;
}
- if (requeue_only) {
- new_fsb = refc->pe_startblock;
- new_len = refc->pe_len;
- } else
+
+ fake.ri_startblock = pmap->pe_startblock;
+ fake.ri_blockcount = pmap->pe_len;
+ if (!requeue_only)
error = xfs_trans_log_finish_refcount_update(tp, cudp,
- type, refc->pe_startblock, refc->pe_len,
- &new_fsb, &new_len, &rcur);
+ &fake, &rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
&cuip->cui_format,
@@ -544,10 +531,13 @@ xfs_cui_item_recover(
goto abort_error;
/* Requeue what we didn't finish. */
- if (new_len > 0) {
- irec.br_startblock = new_fsb;
- irec.br_blockcount = new_len;
- switch (type) {
+ if (fake.ri_blockcount > 0) {
+ struct xfs_bmbt_irec irec = {
+ .br_startblock = fake.ri_startblock,
+ .br_blockcount = fake.ri_blockcount,
+ };
+
+ switch (fake.ri_type) {
case XFS_REFCOUNT_INCREASE:
xfs_refcount_increase_extent(tp, &irec);
break;
@@ -596,18 +586,18 @@ xfs_cui_item_relog(
{
struct xfs_cud_log_item *cudp;
struct xfs_cui_log_item *cuip;
- struct xfs_phys_extent *extp;
+ struct xfs_phys_extent *pmap;
unsigned int count;
count = CUI_ITEM(intent)->cui_format.cui_nextents;
- extp = CUI_ITEM(intent)->cui_format.cui_extents;
+ pmap = CUI_ITEM(intent)->cui_format.cui_extents;
tp->t_flags |= XFS_TRANS_DIRTY;
cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
cuip = xfs_cui_init(tp->t_mountp, count);
- memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
xfs_trans_add_item(tp, &cuip->cui_item);
set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 534504ede1a3..a1619d67015f 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -244,40 +244,40 @@ xfs_trans_get_rud(
/* Set the map extent flags for this reverse mapping. */
static void
xfs_trans_set_rmap_flags(
- struct xfs_map_extent *rmap,
+ struct xfs_map_extent *map,
enum xfs_rmap_intent_type type,
int whichfork,
xfs_exntst_t state)
{
- rmap->me_flags = 0;
+ map->me_flags = 0;
if (state == XFS_EXT_UNWRITTEN)
- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
if (whichfork == XFS_ATTR_FORK)
- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
switch (type) {
case XFS_RMAP_MAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+ map->me_flags |= XFS_RMAP_EXTENT_MAP;
break;
case XFS_RMAP_MAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
break;
case XFS_RMAP_UNMAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP;
break;
case XFS_RMAP_UNMAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
break;
case XFS_RMAP_CONVERT:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT;
break;
case XFS_RMAP_CONVERT_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
break;
case XFS_RMAP_ALLOC:
- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ map->me_flags |= XFS_RMAP_EXTENT_ALLOC;
break;
case XFS_RMAP_FREE:
- rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+ map->me_flags |= XFS_RMAP_EXTENT_FREE;
break;
default:
ASSERT(0);
@@ -293,19 +293,12 @@ static int
xfs_trans_log_finish_rmap_update(
struct xfs_trans *tp,
struct xfs_rud_log_item *rudp,
- enum xfs_rmap_intent_type type,
- uint64_t owner,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state,
+ struct xfs_rmap_intent *ri,
struct xfs_btree_cur **pcur)
{
int error;
- error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
- startblock, blockcount, state, pcur);
+ error = xfs_rmap_finish_one(tp, ri, pcur);
/*
* Mark the transaction dirty, even on error. This ensures the
@@ -342,7 +335,7 @@ STATIC void
xfs_rmap_update_log_item(
struct xfs_trans *tp,
struct xfs_rui_log_item *ruip,
- struct xfs_rmap_intent *rmap)
+ struct xfs_rmap_intent *ri)
{
uint next_extent;
struct xfs_map_extent *map;
@@ -358,12 +351,12 @@ xfs_rmap_update_log_item(
next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
ASSERT(next_extent < ruip->rui_format.rui_nextents);
map = &ruip->rui_format.rui_extents[next_extent];
- map->me_owner = rmap->ri_owner;
- map->me_startblock = rmap->ri_bmap.br_startblock;
- map->me_startoff = rmap->ri_bmap.br_startoff;
- map->me_len = rmap->ri_bmap.br_blockcount;
- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
- rmap->ri_bmap.br_state);
+ map->me_owner = ri->ri_owner;
+ map->me_startblock = ri->ri_bmap.br_startblock;
+ map->me_startoff = ri->ri_bmap.br_startoff;
+ map->me_len = ri->ri_bmap.br_blockcount;
+ xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork,
+ ri->ri_bmap.br_state);
}
static struct xfs_log_item *
@@ -375,15 +368,15 @@ xfs_rmap_update_create_intent(
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
- struct xfs_rmap_intent *rmap;
+ struct xfs_rmap_intent *ri;
ASSERT(count > 0);
xfs_trans_add_item(tp, &ruip->rui_item);
if (sort)
list_sort(mp, items, xfs_rmap_update_diff_items);
- list_for_each_entry(rmap, items, ri_list)
- xfs_rmap_update_log_item(tp, ruip, rmap);
+ list_for_each_entry(ri, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, ri);
return &ruip->rui_item;
}
@@ -405,16 +398,14 @@ xfs_rmap_update_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
- struct xfs_rmap_intent *rmap;
+ struct xfs_rmap_intent *ri;
int error;
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done),
- rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork,
- rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
- rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
+ ri = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
state);
- kmem_cache_free(xfs_rmap_intent_cache, rmap);
+ kmem_cache_free(xfs_rmap_intent_cache, ri);
return error;
}
@@ -431,10 +422,10 @@ STATIC void
xfs_rmap_update_cancel_item(
struct list_head *item)
{
- struct xfs_rmap_intent *rmap;
+ struct xfs_rmap_intent *ri;
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_cache_free(xfs_rmap_intent_cache, rmap);
+ ri = container_of(item, struct xfs_rmap_intent, ri_list);
+ kmem_cache_free(xfs_rmap_intent_cache, ri);
}
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
@@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
static inline bool
xfs_rui_validate_map(
struct xfs_mount *mp,
- struct xfs_map_extent *rmap)
+ struct xfs_map_extent *map)
{
if (!xfs_has_rmapbt(mp))
return false;
- if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)
+ if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS)
return false;
- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
case XFS_RMAP_EXTENT_MAP:
case XFS_RMAP_EXTENT_MAP_SHARED:
case XFS_RMAP_EXTENT_UNMAP:
@@ -473,14 +464,14 @@ xfs_rui_validate_map(
return false;
}
- if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) &&
- !xfs_verify_ino(mp, rmap->me_owner))
+ if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) &&
+ !xfs_verify_ino(mp, map->me_owner))
return false;
- if (!xfs_verify_fileext(mp, rmap->me_startoff, rmap->me_len))
+ if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
return false;
- return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len);
+ return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
/*
@@ -493,15 +484,11 @@ xfs_rui_item_recover(
struct list_head *capture_list)
{
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
- struct xfs_map_extent *rmap;
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- enum xfs_rmap_intent_type type;
- xfs_exntst_t state;
int i;
- int whichfork;
int error = 0;
/*
@@ -526,35 +513,34 @@ xfs_rui_item_recover(
rudp = xfs_trans_get_rud(tp, ruip);
for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- rmap = &ruip->rui_format.rui_extents[i];
- state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ struct xfs_rmap_intent fake = { };
+ struct xfs_map_extent *map;
+
+ map = &ruip->rui_format.rui_extents[i];
+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
case XFS_RMAP_EXTENT_MAP:
- type = XFS_RMAP_MAP;
+ fake.ri_type = XFS_RMAP_MAP;
break;
case XFS_RMAP_EXTENT_MAP_SHARED:
- type = XFS_RMAP_MAP_SHARED;
+ fake.ri_type = XFS_RMAP_MAP_SHARED;
break;
case XFS_RMAP_EXTENT_UNMAP:
- type = XFS_RMAP_UNMAP;
+ fake.ri_type = XFS_RMAP_UNMAP;
break;
case XFS_RMAP_EXTENT_UNMAP_SHARED:
- type = XFS_RMAP_UNMAP_SHARED;
+ fake.ri_type = XFS_RMAP_UNMAP_SHARED;
break;
case XFS_RMAP_EXTENT_CONVERT:
- type = XFS_RMAP_CONVERT;
+ fake.ri_type = XFS_RMAP_CONVERT;
break;
case XFS_RMAP_EXTENT_CONVERT_SHARED:
- type = XFS_RMAP_CONVERT_SHARED;
+ fake.ri_type = XFS_RMAP_CONVERT_SHARED;
break;
case XFS_RMAP_EXTENT_ALLOC:
- type = XFS_RMAP_ALLOC;
+ fake.ri_type = XFS_RMAP_ALLOC;
break;
case XFS_RMAP_EXTENT_FREE:
- type = XFS_RMAP_FREE;
+ fake.ri_type = XFS_RMAP_FREE;
break;
default:
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
@@ -563,13 +549,21 @@ xfs_rui_item_recover(
error = -EFSCORRUPTED;
goto abort_error;
}
- error = xfs_trans_log_finish_rmap_update(tp, rudp, type,
- rmap->me_owner, whichfork,
- rmap->me_startoff, rmap->me_startblock,
- rmap->me_len, state, &rcur);
+
+ fake.ri_owner = map->me_owner;
+ fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ fake.ri_bmap.br_startblock = map->me_startblock;
+ fake.ri_bmap.br_startoff = map->me_startoff;
+ fake.ri_bmap.br_blockcount = map->me_len;
+ fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+
+ error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
+ &rcur);
if (error == -EFSCORRUPTED)
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- rmap, sizeof(*rmap));
+ map, sizeof(*map));
if (error)
goto abort_error;
@@ -600,18 +594,18 @@ xfs_rui_item_relog(
{
struct xfs_rud_log_item *rudp;
struct xfs_rui_log_item *ruip;
- struct xfs_map_extent *extp;
+ struct xfs_map_extent *map;
unsigned int count;
count = RUI_ITEM(intent)->rui_format.rui_nextents;
- extp = RUI_ITEM(intent)->rui_format.rui_extents;
+ map = RUI_ITEM(intent)->rui_format.rui_extents;
tp->t_flags |= XFS_TRANS_DIRTY;
rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
ruip = xfs_rui_init(tp->t_mountp, count);
- memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
atomic_set(&ruip->rui_next_extent, count);
xfs_trans_add_item(tp, &ruip->rui_item);
set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index f7faf6e70d7f..a3c6b1548723 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);
-struct kobj_type xfs_mp_ktype = {
+const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_dbg);
-struct kobj_type xfs_dbg_ktype = {
+const struct kobj_type xfs_dbg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_dbg_groups,
@@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_stats);
-struct kobj_type xfs_stats_ktype = {
+const struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_stats_groups,
@@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_log);
-struct kobj_type xfs_log_ktype = {
+const struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_log_groups,
@@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_error);
-static struct kobj_type xfs_error_cfg_ktype = {
+static const struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_error_groups,
};
-static struct kobj_type xfs_error_ktype = {
+static const struct kobj_type xfs_error_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 513095e353a5..148893ebfdef 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -7,10 +7,10 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__
-extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
-extern struct kobj_type xfs_dbg_ktype; /* debug */
-extern struct kobj_type xfs_log_ktype; /* xlog */
-extern struct kobj_type xfs_stats_ktype; /* stats */
+extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
+extern const struct kobj_type xfs_dbg_ktype; /* debug */
+extern const struct kobj_type xfs_log_ktype; /* xlog */
+extern const struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
@@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject)
static inline int
xfs_sysfs_init(
struct xfs_kobj *kobj,
- struct kobj_type *ktype,
+ const struct kobj_type *ktype,
struct xfs_kobj *parent_kobj,
const char *name)
{
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 421d1e504ac4..6b0e9ae7c513 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
TRACE_EVENT(xfs_refcount_finish_one_leftover,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int type, xfs_agblock_t agbno, xfs_extlen_t len,
- xfs_agblock_t new_agbno, xfs_extlen_t new_len),
- TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+ int type, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, type, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(int, type)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(xfs_agblock_t, new_agbno)
- __field(xfs_extlen_t, new_len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
@@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover,
__entry->type = type;
__entry->agbno = agbno;
__entry->len = len;
- __entry->new_agbno = new_agbno;
- __entry->new_len = new_len;
),
- TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x",
+ TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->agno,
__entry->agbno,
- __entry->len,
- __entry->new_agbno,
- __entry->new_len)
+ __entry->len)
);
/* simple inode-based error/%ip tracepoint class */
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
index 9fe54f5319f2..645f7229de4a 100644
--- a/fs/zonefs/Makefile
+++ b/fs/zonefs/Makefile
@@ -3,4 +3,4 @@ ccflags-y += -I$(src)
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
-zonefs-y := super.o sysfs.o
+zonefs-y := super.o file.o sysfs.o
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
new file mode 100644
index 000000000000..738b0e28d74b
--- /dev/null
+++ b/fs/zonefs/file.c
@@ -0,0 +1,878 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Simple file system for zoned block devices exposing zones as files.
+ *
+ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
+ */
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/iomap.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/statfs.h>
+#include <linux/writeback.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/uio.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "zonefs.h"
+
+#include "trace.h"
+
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /*
+ * All blocks are always mapped below EOF. If reading past EOF,
+ * act as if there is a hole up to the file maximum size.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ trace_zonefs_iomap_begin(inode, iomap);
+
+ return 0;
+}
+
+static const struct iomap_ops zonefs_read_iomap_ops = {
+ .iomap_begin = zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /* All write I/Os should always be within the file maximum size */
+ if (WARN_ON_ONCE(offset + length > z->z_capacity))
+ return -EIO;
+
+ /*
+ * Sequential zones can only accept direct writes. This is already
+ * checked when writes are issued, so warn if we see a page writeback
+ * operation.
+ */
+ if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
+ return -EIO;
+
+ /*
+ * For conventional zones, all blocks are always mapped. For sequential
+ * zones, all blocks after always mapped below the inode size (zone
+ * write pointer) and unwriten beyond.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->length = z->z_capacity - iomap->offset;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ trace_zonefs_iomap_begin(inode, iomap);
+
+ return 0;
+}
+
+static const struct iomap_ops zonefs_write_iomap_ops = {
+ .iomap_begin = zonefs_write_iomap_begin,
+};
+
+static int zonefs_read_folio(struct file *unused, struct folio *folio)
+{
+ return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+}
+
+static void zonefs_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &zonefs_read_iomap_ops);
+}
+
+/*
+ * Map blocks for page writeback. This is used only on conventional zone files,
+ * which implies that the page range can only be within the fixed inode size.
+ */
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+ if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
+ return -EIO;
+ if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+ return -EIO;
+
+ /* If the mapping is already OK, nothing needs to be done */
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+
+ return zonefs_write_iomap_begin(inode, offset,
+ z->z_capacity - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops zonefs_writeback_ops = {
+ .map_blocks = zonefs_write_map_blocks,
+};
+
+static int zonefs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct iomap_writepage_ctx wpc = { };
+
+ return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
+}
+
+static int zonefs_swap_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
+{
+ struct inode *inode = file_inode(swap_file);
+
+ if (zonefs_inode_is_seq(inode)) {
+ zonefs_err(inode->i_sb,
+ "swap file: not a conventional zone file\n");
+ return -EINVAL;
+ }
+
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &zonefs_read_iomap_ops);
+}
+
+const struct address_space_operations zonefs_file_aops = {
+ .read_folio = zonefs_read_folio,
+ .readahead = zonefs_readahead,
+ .writepages = zonefs_writepages,
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+ .direct_IO = noop_direct_IO,
+ .swap_activate = zonefs_swap_activate,
+};
+
+int zonefs_file_truncate(struct inode *inode, loff_t isize)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ loff_t old_isize;
+ enum req_op op;
+ int ret = 0;
+
+ /*
+ * Only sequential zone files can be truncated and truncation is allowed
+ * only down to a 0 size, which is equivalent to a zone reset, and to
+ * the maximum file size, which is equivalent to a zone finish.
+ */
+ if (!zonefs_zone_is_seq(z))
+ return -EPERM;
+
+ if (!isize)
+ op = REQ_OP_ZONE_RESET;
+ else if (isize == z->z_capacity)
+ op = REQ_OP_ZONE_FINISH;
+ else
+ return -EPERM;
+
+ inode_dio_wait(inode);
+
+ /* Serialize against page faults */
+ filemap_invalidate_lock(inode->i_mapping);
+
+ /* Serialize against zonefs_iomap_begin() */
+ mutex_lock(&zi->i_truncate_mutex);
+
+ old_isize = i_size_read(inode);
+ if (isize == old_isize)
+ goto unlock;
+
+ ret = zonefs_inode_zone_mgmt(inode, op);
+ if (ret)
+ goto unlock;
+
+ /*
+ * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+ * take care of open zones.
+ */
+ if (z->z_flags & ZONEFS_ZONE_OPEN) {
+ /*
+ * Truncating a zone to EMPTY or FULL is the equivalent of
+ * closing the zone. For a truncation to 0, we need to
+ * re-open the zone to ensure new writes can be processed.
+ * For a truncation to the maximum file size, the zone is
+ * closed and writes cannot be accepted anymore, so clear
+ * the open flag.
+ */
+ if (!isize)
+ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+ else
+ z->z_flags &= ~ZONEFS_ZONE_OPEN;
+ }
+
+ zonefs_update_stats(inode, isize);
+ truncate_setsize(inode, isize);
+ z->z_wpoffset = isize;
+ zonefs_inode_account_active(inode);
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return ret;
+}
+
+static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *inode = file_inode(file);
+ int ret = 0;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ /*
+ * Since only direct writes are allowed in sequential files, page cache
+ * flush is needed only for conventional zone files.
+ */
+ if (zonefs_inode_is_cnv(inode))
+ ret = file_write_and_wait_range(file, start, end);
+ if (!ret)
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+
+ if (ret)
+ zonefs_io_error(inode, true);
+
+ return ret;
+}
+
+static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Sanity check: only conventional zone files can have shared
+ * writeable mappings.
+ */
+ if (zonefs_inode_is_seq(inode))
+ return VM_FAULT_NOPAGE;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vmf->vma->vm_file);
+
+ /* Serialize against truncates */
+ filemap_invalidate_lock_shared(inode->i_mapping);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static const struct vm_operations_struct zonefs_file_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = zonefs_filemap_page_mkwrite,
+};
+
+static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /*
+ * Conventional zones accept random writes, so their files can support
+ * shared writable mappings. For sequential zone files, only read
+ * mappings are possible since there are no guarantees for write
+ * ordering between msync() and page cache writeback.
+ */
+ if (zonefs_inode_is_seq(file_inode(file)) &&
+ (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+
+ file_accessed(file);
+ vma->vm_ops = &zonefs_file_vm_ops;
+
+ return 0;
+}
+
+static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t isize = i_size_read(file_inode(file));
+
+ /*
+ * Seeks are limited to below the zone size for conventional zones
+ * and below the zone write pointer for sequential zones. In both
+ * cases, this limit is the inode size.
+ */
+ return generic_file_llseek_size(file, offset, whence, isize, isize);
+}
+
+static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ if (error) {
+ zonefs_io_error(inode, true);
+ return error;
+ }
+
+ if (size && zonefs_inode_is_seq(inode)) {
+ /*
+ * Note that we may be seeing completions out of order,
+ * but that is not a problem since a write completed
+ * successfully necessarily means that all preceding writes
+ * were also successful. So we can safely increase the inode
+ * size to the write end location.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ if (i_size_read(inode) < iocb->ki_pos + size) {
+ zonefs_update_stats(inode, iocb->ki_pos + size);
+ zonefs_i_size_write(inode, iocb->ki_pos + size);
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ return 0;
+}
+
+static const struct iomap_dio_ops zonefs_write_dio_ops = {
+ .end_io = zonefs_file_write_dio_end_io,
+};
+
+static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int max = bdev_max_zone_append_sectors(bdev);
+ struct bio *bio;
+ ssize_t size;
+ int nr_pages;
+ ssize_t ret;
+
+ max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
+ iov_iter_truncate(from, max);
+
+ nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
+ if (!nr_pages)
+ return 0;
+
+ bio = bio_alloc(bdev, nr_pages,
+ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
+ bio->bi_iter.bi_sector = z->z_sector;
+ bio->bi_ioprio = iocb->ki_ioprio;
+ if (iocb_is_dsync(iocb))
+ bio->bi_opf |= REQ_FUA;
+
+ ret = bio_iov_iter_get_pages(bio, from);
+ if (unlikely(ret))
+ goto out_release;
+
+ size = bio->bi_iter.bi_size;
+ task_io_account_write(size);
+
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, iocb);
+
+ ret = submit_bio_wait(bio);
+
+ /*
+ * If the file zone was written underneath the file system, the zone
+ * write pointer may not be where we expect it to be, but the zone
+ * append write can still succeed. So check manually that we wrote where
+ * we intended to, that is, at zi->i_wpoffset.
+ */
+ if (!ret) {
+ sector_t wpsector =
+ z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT);
+
+ if (bio->bi_iter.bi_sector != wpsector) {
+ zonefs_warn(inode->i_sb,
+ "Corrupted write pointer %llu for zone at %llu\n",
+ wpsector, z->z_sector);
+ ret = -EIO;
+ }
+ }
+
+ zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+ trace_zonefs_file_dio_append(inode, size, ret);
+
+out_release:
+ bio_release_pages(bio, false);
+ bio_put(bio);
+
+ if (ret >= 0) {
+ iocb->ki_pos += size;
+ return size;
+ }
+
+ return ret;
+}
+
+/*
+ * Do not exceed the LFS limits nor the file zone size. If pos is under the
+ * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
+ */
+static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+ loff_t count)
+{
+ struct inode *inode = file_inode(file);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+ loff_t max_size = z->z_capacity;
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ count = min(count, limit - pos);
+ }
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = min_t(loff_t, MAX_NON_LFS, max_size);
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+
+ return min(count, max_size - pos);
+}
+
+static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ loff_t count;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
+ if (iocb->ki_flags & IOCB_APPEND) {
+ if (zonefs_zone_is_cnv(z))
+ return -EINVAL;
+ mutex_lock(&zi->i_truncate_mutex);
+ iocb->ki_pos = z->z_wpoffset;
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ count = zonefs_write_check_limits(file, iocb->ki_pos,
+ iov_iter_count(from));
+ if (count < 0)
+ return count;
+
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
+}
+
+/*
+ * Handle direct writes. For sequential zone files, this is the only possible
+ * write path. For these files, check that the user is issuing writes
+ * sequentially from the end of the file. This code assumes that the block layer
+ * delivers write requests to the device in sequential order. This is always the
+ * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
+ * elevator feature is being used (e.g. mq-deadline). The block layer always
+ * automatically select such an elevator for zoned block devices during the
+ * device initialization.
+ */
+static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ bool sync = is_sync_kiocb(iocb);
+ bool append = false;
+ ssize_t ret, count;
+
+ /*
+ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
+ * as this can cause write reordering (e.g. the first aio gets EAGAIN
+ * on the inode lock but the second goes through but is now unaligned).
+ */
+ if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ count = zonefs_write_checks(iocb, from);
+ if (count <= 0) {
+ ret = count;
+ goto inode_unlock;
+ }
+
+ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
+ ret = -EINVAL;
+ goto inode_unlock;
+ }
+
+ /* Enforce sequential writes (append only) in sequential zones */
+ if (zonefs_zone_is_seq(z)) {
+ mutex_lock(&zi->i_truncate_mutex);
+ if (iocb->ki_pos != z->z_wpoffset) {
+ mutex_unlock(&zi->i_truncate_mutex);
+ ret = -EINVAL;
+ goto inode_unlock;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+ append = sync;
+ }
+
+ if (append)
+ ret = zonefs_file_dio_append(iocb, from);
+ else
+ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+ &zonefs_write_dio_ops, 0, NULL, 0);
+ if (zonefs_zone_is_seq(z) &&
+ (ret > 0 || ret == -EIOCBQUEUED)) {
+ if (ret > 0)
+ count = ret;
+
+ /*
+ * Update the zone write pointer offset assuming the write
+ * operation succeeded. If it did not, the error recovery path
+ * will correct it. Also do active seq file accounting.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+inode_unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ /*
+ * Direct IO writes are mandatory for sequential zone files so that the
+ * write IO issuing order is preserved.
+ */
+ if (zonefs_inode_is_seq(inode))
+ return -EIO;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ ret = zonefs_write_checks(iocb, from);
+ if (ret <= 0)
+ goto inode_unlock;
+
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
+ if (ret > 0)
+ iocb->ki_pos += ret;
+ else if (ret == -EIO)
+ zonefs_io_error(inode, true);
+
+inode_unlock:
+ inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+
+ return ret;
+}
+
+static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ if (sb_rdonly(inode->i_sb))
+ return -EROFS;
+
+ /* Write operations beyond the zone capacity are not allowed */
+ if (iocb->ki_pos >= z->z_capacity)
+ return -EFBIG;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ssize_t ret = zonefs_file_dio_write(iocb, from);
+
+ if (ret != -ENOTBLK)
+ return ret;
+ }
+
+ return zonefs_file_buffered_write(iocb, from);
+}
+
+static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ if (error) {
+ zonefs_io_error(file_inode(iocb->ki_filp), false);
+ return error;
+ }
+
+ return 0;
+}
+
+static const struct iomap_dio_ops zonefs_read_dio_ops = {
+ .end_io = zonefs_file_read_dio_end_io,
+};
+
+static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+ ssize_t ret;
+
+ /* Offline zones cannot be read */
+ if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
+ return -EPERM;
+
+ if (iocb->ki_pos >= z->z_capacity)
+ return 0;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ /* Limit read operations to written data */
+ mutex_lock(&zi->i_truncate_mutex);
+ isize = i_size_read(inode);
+ if (iocb->ki_pos >= isize) {
+ mutex_unlock(&zi->i_truncate_mutex);
+ ret = 0;
+ goto inode_unlock;
+ }
+ iov_iter_truncate(to, isize - iocb->ki_pos);
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ size_t count = iov_iter_count(to);
+
+ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
+ ret = -EINVAL;
+ goto inode_unlock;
+ }
+ file_accessed(iocb->ki_filp);
+ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
+ &zonefs_read_dio_ops, 0, NULL, 0);
+ } else {
+ ret = generic_file_read_iter(iocb, to);
+ if (ret == -EIO)
+ zonefs_io_error(inode, false);
+ }
+
+inode_unlock:
+ inode_unlock_shared(inode);
+
+ return ret;
+}
+
+/*
+ * Write open accounting is done only for sequential files.
+ */
+static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+ struct file *file)
+{
+ if (zonefs_inode_is_cnv(inode))
+ return false;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return false;
+
+ return true;
+}
+
+static int zonefs_seq_file_write_open(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ if (!zi->i_wr_refcnt) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
+
+ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+
+ if (sbi->s_max_wro_seq_files
+ && wro > sbi->s_max_wro_seq_files) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (i_size_read(inode) < z->z_capacity) {
+ ret = zonefs_inode_zone_mgmt(inode,
+ REQ_OP_ZONE_OPEN);
+ if (ret) {
+ atomic_dec(&sbi->s_wro_seq_files);
+ goto unlock;
+ }
+ z->z_flags |= ZONEFS_ZONE_OPEN;
+ zonefs_inode_account_active(inode);
+ }
+ }
+ }
+
+ zi->i_wr_refcnt++;
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return ret;
+}
+
+static int zonefs_file_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = generic_file_open(inode, file);
+ if (ret)
+ return ret;
+
+ if (zonefs_seq_file_need_wro(inode, file))
+ return zonefs_seq_file_write_open(inode);
+
+ return 0;
+}
+
+static void zonefs_seq_file_write_close(struct inode *inode)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ int ret = 0;
+
+ mutex_lock(&zi->i_truncate_mutex);
+
+ zi->i_wr_refcnt--;
+ if (zi->i_wr_refcnt)
+ goto unlock;
+
+ /*
+ * The file zone may not be open anymore (e.g. the file was truncated to
+ * its maximum size or it was fully written). For this case, we only
+ * need to decrement the write open count.
+ */
+ if (z->z_flags & ZONEFS_ZONE_OPEN) {
+ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ if (ret) {
+ __zonefs_io_error(inode, false);
+ /*
+ * Leaving zones explicitly open may lead to a state
+ * where most zones cannot be written (zone resources
+ * exhausted). So take preventive action by remounting
+ * read-only.
+ */
+ if (z->z_flags & ZONEFS_ZONE_OPEN &&
+ !(sb->s_flags & SB_RDONLY)) {
+ zonefs_warn(sb,
+ "closing zone at %llu failed %d\n",
+ z->z_sector, ret);
+ zonefs_warn(sb,
+ "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ }
+ goto unlock;
+ }
+
+ z->z_flags &= ~ZONEFS_ZONE_OPEN;
+ zonefs_inode_account_active(inode);
+ }
+
+ atomic_dec(&sbi->s_wro_seq_files);
+
+unlock:
+ mutex_unlock(&zi->i_truncate_mutex);
+}
+
+static int zonefs_file_release(struct inode *inode, struct file *file)
+{
+ /*
+ * If we explicitly open a zone we must close it again as well, but the
+ * zone management operation can fail (either due to an IO error or as
+ * the zone has gone offline or read-only). Make sure we don't fail the
+ * close(2) for user-space.
+ */
+ if (zonefs_seq_file_need_wro(inode, file))
+ zonefs_seq_file_write_close(inode);
+
+ return 0;
+}
+
+const struct file_operations zonefs_file_operations = {
+ .open = zonefs_file_open,
+ .release = zonefs_file_release,
+ .fsync = zonefs_file_fsync,
+ .mmap = zonefs_file_mmap,
+ .llseek = zonefs_file_llseek,
+ .read_iter = zonefs_file_read_iter,
+ .write_iter = zonefs_file_write_iter,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+ .iopoll = iocb_bio_iopoll,
+};
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 72ef97320b99..23b8b299c64e 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -28,33 +28,47 @@
#include "trace.h"
/*
- * Manage the active zone count. Called with zi->i_truncate_mutex held.
+ * Get the name of a zone group directory.
*/
-static void zonefs_account_active(struct inode *inode)
+static const char *zonefs_zgroup_name(enum zonefs_ztype ztype)
{
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ switch (ztype) {
+ case ZONEFS_ZTYPE_CNV:
+ return "cnv";
+ case ZONEFS_ZTYPE_SEQ:
+ return "seq";
+ default:
+ WARN_ON_ONCE(1);
+ return "???";
+ }
+}
- lockdep_assert_held(&zi->i_truncate_mutex);
+/*
+ * Manage the active zone count.
+ */
+static void zonefs_account_active(struct super_block *sb,
+ struct zonefs_zone *z)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+ if (zonefs_zone_is_cnv(z))
return;
/*
* For zones that transitioned to the offline or readonly condition,
* we only need to clear the active state.
*/
- if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+ if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
goto out;
/*
* If the zone is active, that is, if it is explicitly open or
* partially written, check if it was already accounted as active.
*/
- if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
- (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
- if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
- zi->i_flags |= ZONEFS_ZONE_ACTIVE;
+ if ((z->z_flags & ZONEFS_ZONE_OPEN) ||
+ (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) {
+ if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) {
+ z->z_flags |= ZONEFS_ZONE_ACTIVE;
atomic_inc(&sbi->s_active_seq_files);
}
return;
@@ -62,18 +76,29 @@ static void zonefs_account_active(struct inode *inode)
out:
/* The zone is not active. If it was, update the active count */
- if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
- zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
+ if (z->z_flags & ZONEFS_ZONE_ACTIVE) {
+ z->z_flags &= ~ZONEFS_ZONE_ACTIVE;
atomic_dec(&sbi->s_active_seq_files);
}
}
-static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
+/*
+ * Manage the active zone count. Called with zi->i_truncate_mutex held.
+ */
+void zonefs_inode_account_active(struct inode *inode)
{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- int ret;
+ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex);
- lockdep_assert_held(&zi->i_truncate_mutex);
+ return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode));
+}
+
+/*
+ * Execute a zone management operation.
+ */
+static int zonefs_zone_mgmt(struct super_block *sb,
+ struct zonefs_zone *z, enum req_op op)
+{
+ int ret;
/*
* With ZNS drives, closing an explicitly open zone that has not been
@@ -83,201 +108,49 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
* are exceeded, make sure that the zone does not remain active by
* resetting it.
*/
- if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset)
op = REQ_OP_ZONE_RESET;
- trace_zonefs_zone_mgmt(inode, op);
- ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
- zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
+ trace_zonefs_zone_mgmt(sb, z, op);
+ ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
+ z->z_size >> SECTOR_SHIFT, GFP_NOFS);
if (ret) {
- zonefs_err(inode->i_sb,
+ zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",
- blk_op_str(op), zi->i_zsector, ret);
+ blk_op_str(op), z->z_sector, ret);
return ret;
}
return 0;
}
-static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
- i_size_write(inode, isize);
- /*
- * A full zone is no longer open/active and does not need
- * explicit closing.
- */
- if (isize >= zi->i_max_size) {
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
-
- if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
- atomic_dec(&sbi->s_active_seq_files);
- zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
- }
-}
-
-static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
- loff_t length, unsigned int flags,
- struct iomap *iomap, struct iomap *srcmap)
+int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op)
{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct super_block *sb = inode->i_sb;
- loff_t isize;
+ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex);
- /*
- * All blocks are always mapped below EOF. If reading past EOF,
- * act as if there is a hole up to the file maximum size.
- */
- mutex_lock(&zi->i_truncate_mutex);
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- isize = i_size_read(inode);
- if (iomap->offset >= isize) {
- iomap->type = IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->length = length;
- } else {
- iomap->type = IOMAP_MAPPED;
- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
- iomap->length = isize - iomap->offset;
- }
- mutex_unlock(&zi->i_truncate_mutex);
-
- trace_zonefs_iomap_begin(inode, iomap);
-
- return 0;
+ return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op);
}
-static const struct iomap_ops zonefs_read_iomap_ops = {
- .iomap_begin = zonefs_read_iomap_begin,
-};
-
-static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
- loff_t length, unsigned int flags,
- struct iomap *iomap, struct iomap *srcmap)
+void zonefs_i_size_write(struct inode *inode, loff_t isize)
{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct super_block *sb = inode->i_sb;
- loff_t isize;
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
- /* All write I/Os should always be within the file maximum size */
- if (WARN_ON_ONCE(offset + length > zi->i_max_size))
- return -EIO;
-
- /*
- * Sequential zones can only accept direct writes. This is already
- * checked when writes are issued, so warn if we see a page writeback
- * operation.
- */
- if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- !(flags & IOMAP_DIRECT)))
- return -EIO;
+ i_size_write(inode, isize);
/*
- * For conventional zones, all blocks are always mapped. For sequential
- * zones, all blocks after always mapped below the inode size (zone
- * write pointer) and unwriten beyond.
+ * A full zone is no longer open/active and does not need
+ * explicit closing.
*/
- mutex_lock(&zi->i_truncate_mutex);
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
- isize = i_size_read(inode);
- if (iomap->offset >= isize) {
- iomap->type = IOMAP_UNWRITTEN;
- iomap->length = zi->i_max_size - iomap->offset;
- } else {
- iomap->type = IOMAP_MAPPED;
- iomap->length = isize - iomap->offset;
- }
- mutex_unlock(&zi->i_truncate_mutex);
-
- trace_zonefs_iomap_begin(inode, iomap);
-
- return 0;
-}
-
-static const struct iomap_ops zonefs_write_iomap_ops = {
- .iomap_begin = zonefs_write_iomap_begin,
-};
-
-static int zonefs_read_folio(struct file *unused, struct folio *folio)
-{
- return iomap_read_folio(folio, &zonefs_read_iomap_ops);
-}
-
-static void zonefs_readahead(struct readahead_control *rac)
-{
- iomap_readahead(rac, &zonefs_read_iomap_ops);
-}
-
-/*
- * Map blocks for page writeback. This is used only on conventional zone files,
- * which implies that the page range can only be within the fixed inode size.
- */
-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
- return -EIO;
- if (WARN_ON_ONCE(offset >= i_size_read(inode)))
- return -EIO;
-
- /* If the mapping is already OK, nothing needs to be done */
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
-
- return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
-}
-
-static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_write_map_blocks,
-};
-
-static int zonefs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct iomap_writepage_ctx wpc = { };
-
- return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
-}
-
-static int zonefs_swap_activate(struct swap_info_struct *sis,
- struct file *swap_file, sector_t *span)
-{
- struct inode *inode = file_inode(swap_file);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ if (isize >= z->z_capacity) {
+ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
- zonefs_err(inode->i_sb,
- "swap file: not a conventional zone file\n");
- return -EINVAL;
+ if (z->z_flags & ZONEFS_ZONE_ACTIVE)
+ atomic_dec(&sbi->s_active_seq_files);
+ z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
}
-
- return iomap_swapfile_activate(sis, swap_file, span,
- &zonefs_read_iomap_ops);
}
-static const struct address_space_operations zonefs_file_aops = {
- .read_folio = zonefs_read_folio,
- .readahead = zonefs_readahead,
- .writepages = zonefs_writepages,
- .dirty_folio = filemap_dirty_folio,
- .release_folio = iomap_release_folio,
- .invalidate_folio = iomap_invalidate_folio,
- .migrate_folio = filemap_migrate_folio,
- .is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
- .direct_IO = noop_direct_IO,
- .swap_activate = zonefs_swap_activate,
-};
-
-static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
+void zonefs_update_stats(struct inode *inode, loff_t new_isize)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -310,63 +183,69 @@ static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
}
/*
- * Check a zone condition and adjust its file inode access permissions for
- * offline and readonly zones. Return the inode size corresponding to the
- * amount of readable data in the zone.
+ * Check a zone condition. Return the amount of written (and still readable)
+ * data in the zone.
*/
-static loff_t zonefs_check_zone_condition(struct inode *inode,
- struct blk_zone *zone, bool warn,
- bool mount)
+static loff_t zonefs_check_zone_condition(struct super_block *sb,
+ struct zonefs_zone *z,
+ struct blk_zone *zone)
{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
switch (zone->cond) {
case BLK_ZONE_COND_OFFLINE:
- /*
- * Dead zone: make the inode immutable, disable all accesses
- * and set the file size to 0 (zone wp set to zone start).
- */
- if (warn)
- zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
- inode->i_ino);
- inode->i_flags |= S_IMMUTABLE;
- inode->i_mode &= ~0777;
- zone->wp = zone->start;
- zi->i_flags |= ZONEFS_ZONE_OFFLINE;
+ zonefs_warn(sb, "Zone %llu: offline zone\n",
+ z->z_sector);
+ z->z_flags |= ZONEFS_ZONE_OFFLINE;
return 0;
case BLK_ZONE_COND_READONLY:
/*
- * The write pointer of read-only zones is invalid. If such a
- * zone is found during mount, the file size cannot be retrieved
- * so we treat the zone as offline (mount == true case).
- * Otherwise, keep the file size as it was when last updated
- * so that the user can recover data. In both cases, writes are
- * always disabled for the zone.
+ * The write pointer of read-only zones is invalid, so we cannot
+ * determine the zone wpoffset (inode size). We thus keep the
+ * zone wpoffset as is, which leads to an empty file
+ * (wpoffset == 0) on mount. For a runtime error, this keeps
+ * the inode size as it was when last updated so that the user
+ * can recover data.
*/
- if (warn)
- zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
- inode->i_ino);
- inode->i_flags |= S_IMMUTABLE;
- if (mount) {
- zone->cond = BLK_ZONE_COND_OFFLINE;
- inode->i_mode &= ~0777;
- zone->wp = zone->start;
- zi->i_flags |= ZONEFS_ZONE_OFFLINE;
- return 0;
- }
- zi->i_flags |= ZONEFS_ZONE_READONLY;
- inode->i_mode &= ~0222;
- return i_size_read(inode);
+ zonefs_warn(sb, "Zone %llu: read-only zone\n",
+ z->z_sector);
+ z->z_flags |= ZONEFS_ZONE_READONLY;
+ if (zonefs_zone_is_cnv(z))
+ return z->z_capacity;
+ return z->z_wpoffset;
case BLK_ZONE_COND_FULL:
/* The write pointer of full zones is invalid. */
- return zi->i_max_size;
+ return z->z_capacity;
default:
- if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
- return zi->i_max_size;
+ if (zonefs_zone_is_cnv(z))
+ return z->z_capacity;
return (zone->wp - zone->start) << SECTOR_SHIFT;
}
}
+/*
+ * Check a zone condition and adjust its inode access permissions for
+ * offline and readonly zones.
+ */
+static void zonefs_inode_update_mode(struct inode *inode)
+{
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+ if (z->z_flags & ZONEFS_ZONE_OFFLINE) {
+ /* Offline zones cannot be read nor written */
+ inode->i_flags |= S_IMMUTABLE;
+ inode->i_mode &= ~0777;
+ } else if (z->z_flags & ZONEFS_ZONE_READONLY) {
+ /* Readonly zones cannot be written */
+ inode->i_flags |= S_IMMUTABLE;
+ if (z->z_flags & ZONEFS_ZONE_INIT_MODE)
+ inode->i_mode &= ~0777;
+ else
+ inode->i_mode &= ~0222;
+ }
+
+ z->z_flags &= ~ZONEFS_ZONE_INIT_MODE;
+ z->z_mode = inode->i_mode;
+}
+
struct zonefs_ioerr_data {
struct inode *inode;
bool write;
@@ -377,7 +256,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
{
struct zonefs_ioerr_data *err = data;
struct inode *inode = err->inode;
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
loff_t isize, data_size;
@@ -388,10 +267,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* as there is no inconsistency between the inode size and the amount of
* data writen in the zone (data_size).
*/
- data_size = zonefs_check_zone_condition(inode, zone, true, false);
+ data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
- if (zone->cond != BLK_ZONE_COND_OFFLINE &&
- zone->cond != BLK_ZONE_COND_READONLY &&
+ if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
!err->write && isize == data_size)
return 0;
@@ -414,8 +292,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
- zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
+ if (zonefs_zone_is_seq(z) && isize != data_size)
+ zonefs_warn(sb,
+ "inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
/*
@@ -424,24 +303,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* zone condition to read-only and offline respectively, as if the
* condition was signaled by the hardware.
*/
- if (zone->cond == BLK_ZONE_COND_OFFLINE ||
- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
+ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) ||
+ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) {
zonefs_warn(sb, "inode %lu: read/write access disabled\n",
inode->i_ino);
- if (zone->cond != BLK_ZONE_COND_OFFLINE) {
- zone->cond = BLK_ZONE_COND_OFFLINE;
- data_size = zonefs_check_zone_condition(inode, zone,
- false, false);
- }
- } else if (zone->cond == BLK_ZONE_COND_READONLY ||
- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
+ if (!(z->z_flags & ZONEFS_ZONE_OFFLINE))
+ z->z_flags |= ZONEFS_ZONE_OFFLINE;
+ zonefs_inode_update_mode(inode);
+ data_size = 0;
+ } else if ((z->z_flags & ZONEFS_ZONE_READONLY) ||
+ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) {
zonefs_warn(sb, "inode %lu: write access disabled\n",
inode->i_ino);
- if (zone->cond != BLK_ZONE_COND_READONLY) {
- zone->cond = BLK_ZONE_COND_READONLY;
- data_size = zonefs_check_zone_condition(inode, zone,
- false, false);
- }
+ if (!(z->z_flags & ZONEFS_ZONE_READONLY))
+ z->z_flags |= ZONEFS_ZONE_READONLY;
+ zonefs_inode_update_mode(inode);
+ data_size = isize;
} else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
data_size > isize) {
/* Do not expose garbage data */
@@ -455,9 +332,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* close of the zone when the inode file is closed.
*/
if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
- (zone->cond == BLK_ZONE_COND_OFFLINE ||
- zone->cond == BLK_ZONE_COND_READONLY))
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+ (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)))
+ z->z_flags &= ~ZONEFS_ZONE_OPEN;
/*
* If error=remount-ro was specified, any error result in remounting
@@ -474,8 +350,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
*/
zonefs_update_stats(inode, data_size);
zonefs_i_size_write(inode, data_size);
- zi->i_wpoffset = data_size;
- zonefs_account_active(inode);
+ z->z_wpoffset = data_size;
+ zonefs_inode_account_active(inode);
return 0;
}
@@ -487,9 +363,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* eventually correct the file size and zonefs inode write pointer offset
* (which can be out of sync with the drive due to partial write failures).
*/
-static void __zonefs_io_error(struct inode *inode, bool write)
+void __zonefs_io_error(struct inode *inode, bool write)
{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
@@ -505,8 +381,8 @@ static void __zonefs_io_error(struct inode *inode, bool write)
* files with aggregated conventional zones, for which the inode zone
* size is always larger than the device zone size.
*/
- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = zi->i_zone_size >>
+ if (z->z_size > bdev_zone_sectors(sb->s_bdev))
+ nr_zones = z->z_size >>
(sbi->s_zone_sectors_shift + SECTOR_SHIFT);
/*
@@ -518,7 +394,7 @@ static void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
zonefs_io_error_cb, &err);
if (ret != nr_zones)
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
@@ -526,749 +402,6 @@ static void __zonefs_io_error(struct inode *inode, bool write)
memalloc_noio_restore(noio_flag);
}
-static void zonefs_io_error(struct inode *inode, bool write)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
- mutex_lock(&zi->i_truncate_mutex);
- __zonefs_io_error(inode, write);
- mutex_unlock(&zi->i_truncate_mutex);
-}
-
-static int zonefs_file_truncate(struct inode *inode, loff_t isize)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- loff_t old_isize;
- enum req_op op;
- int ret = 0;
-
- /*
- * Only sequential zone files can be truncated and truncation is allowed
- * only down to a 0 size, which is equivalent to a zone reset, and to
- * the maximum file size, which is equivalent to a zone finish.
- */
- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
- return -EPERM;
-
- if (!isize)
- op = REQ_OP_ZONE_RESET;
- else if (isize == zi->i_max_size)
- op = REQ_OP_ZONE_FINISH;
- else
- return -EPERM;
-
- inode_dio_wait(inode);
-
- /* Serialize against page faults */
- filemap_invalidate_lock(inode->i_mapping);
-
- /* Serialize against zonefs_iomap_begin() */
- mutex_lock(&zi->i_truncate_mutex);
-
- old_isize = i_size_read(inode);
- if (isize == old_isize)
- goto unlock;
-
- ret = zonefs_zone_mgmt(inode, op);
- if (ret)
- goto unlock;
-
- /*
- * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
- * take care of open zones.
- */
- if (zi->i_flags & ZONEFS_ZONE_OPEN) {
- /*
- * Truncating a zone to EMPTY or FULL is the equivalent of
- * closing the zone. For a truncation to 0, we need to
- * re-open the zone to ensure new writes can be processed.
- * For a truncation to the maximum file size, the zone is
- * closed and writes cannot be accepted anymore, so clear
- * the open flag.
- */
- if (!isize)
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
- else
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
- }
-
- zonefs_update_stats(inode, isize);
- truncate_setsize(inode, isize);
- zi->i_wpoffset = isize;
- zonefs_account_active(inode);
-
-unlock:
- mutex_unlock(&zi->i_truncate_mutex);
- filemap_invalidate_unlock(inode->i_mapping);
-
- return ret;
-}
-
-static int zonefs_inode_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = d_inode(dentry);
- int ret;
-
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
- ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
- if (ret)
- return ret;
-
- /*
- * Since files and directories cannot be created nor deleted, do not
- * allow setting any write attributes on the sub-directories grouping
- * files by zone type.
- */
- if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
- (iattr->ia_mode & 0222))
- return -EPERM;
-
- if (((iattr->ia_valid & ATTR_UID) &&
- !uid_eq(iattr->ia_uid, inode->i_uid)) ||
- ((iattr->ia_valid & ATTR_GID) &&
- !gid_eq(iattr->ia_gid, inode->i_gid))) {
- ret = dquot_transfer(&nop_mnt_idmap, inode, iattr);
- if (ret)
- return ret;
- }
-
- if (iattr->ia_valid & ATTR_SIZE) {
- ret = zonefs_file_truncate(inode, iattr->ia_size);
- if (ret)
- return ret;
- }
-
- setattr_copy(&nop_mnt_idmap, inode, iattr);
-
- return 0;
-}
-
-static const struct inode_operations zonefs_file_inode_operations = {
- .setattr = zonefs_inode_setattr,
-};
-
-static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = file_inode(file);
- int ret = 0;
-
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
- /*
- * Since only direct writes are allowed in sequential files, page cache
- * flush is needed only for conventional zone files.
- */
- if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
- ret = file_write_and_wait_range(file, start, end);
- if (!ret)
- ret = blkdev_issue_flush(inode->i_sb->s_bdev);
-
- if (ret)
- zonefs_io_error(inode, true);
-
- return ret;
-}
-
-static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- vm_fault_t ret;
-
- if (unlikely(IS_IMMUTABLE(inode)))
- return VM_FAULT_SIGBUS;
-
- /*
- * Sanity check: only conventional zone files can have shared
- * writeable mappings.
- */
- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
- return VM_FAULT_NOPAGE;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
-
- /* Serialize against truncates */
- filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
- filemap_invalidate_unlock_shared(inode->i_mapping);
-
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
-static const struct vm_operations_struct zonefs_file_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = zonefs_filemap_page_mkwrite,
-};
-
-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
-{
- /*
- * Conventional zones accept random writes, so their files can support
- * shared writable mappings. For sequential zone files, only read
- * mappings are possible since there are no guarantees for write
- * ordering between msync() and page cache writeback.
- */
- if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
- return -EINVAL;
-
- file_accessed(file);
- vma->vm_ops = &zonefs_file_vm_ops;
-
- return 0;
-}
-
-static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
-{
- loff_t isize = i_size_read(file_inode(file));
-
- /*
- * Seeks are limited to below the zone size for conventional zones
- * and below the zone write pointer for sequential zones. In both
- * cases, this limit is the inode size.
- */
- return generic_file_llseek_size(file, offset, whence, isize, isize);
-}
-
-static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
- int error, unsigned int flags)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
- if (error) {
- zonefs_io_error(inode, true);
- return error;
- }
-
- if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
- /*
- * Note that we may be seeing completions out of order,
- * but that is not a problem since a write completed
- * successfully necessarily means that all preceding writes
- * were also successful. So we can safely increase the inode
- * size to the write end location.
- */
- mutex_lock(&zi->i_truncate_mutex);
- if (i_size_read(inode) < iocb->ki_pos + size) {
- zonefs_update_stats(inode, iocb->ki_pos + size);
- zonefs_i_size_write(inode, iocb->ki_pos + size);
- }
- mutex_unlock(&zi->i_truncate_mutex);
- }
-
- return 0;
-}
-
-static const struct iomap_dio_ops zonefs_write_dio_ops = {
- .end_io = zonefs_file_write_dio_end_io,
-};
-
-static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int max = bdev_max_zone_append_sectors(bdev);
- struct bio *bio;
- ssize_t size;
- int nr_pages;
- ssize_t ret;
-
- max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
- iov_iter_truncate(from, max);
-
- nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
- if (!nr_pages)
- return 0;
-
- bio = bio_alloc(bdev, nr_pages,
- REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
- bio->bi_iter.bi_sector = zi->i_zsector;
- bio->bi_ioprio = iocb->ki_ioprio;
- if (iocb_is_dsync(iocb))
- bio->bi_opf |= REQ_FUA;
-
- ret = bio_iov_iter_get_pages(bio, from);
- if (unlikely(ret))
- goto out_release;
-
- size = bio->bi_iter.bi_size;
- task_io_account_write(size);
-
- if (iocb->ki_flags & IOCB_HIPRI)
- bio_set_polled(bio, iocb);
-
- ret = submit_bio_wait(bio);
-
- /*
- * If the file zone was written underneath the file system, the zone
- * write pointer may not be where we expect it to be, but the zone
- * append write can still succeed. So check manually that we wrote where
- * we intended to, that is, at zi->i_wpoffset.
- */
- if (!ret) {
- sector_t wpsector =
- zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
-
- if (bio->bi_iter.bi_sector != wpsector) {
- zonefs_warn(inode->i_sb,
- "Corrupted write pointer %llu for zone at %llu\n",
- wpsector, zi->i_zsector);
- ret = -EIO;
- }
- }
-
- zonefs_file_write_dio_end_io(iocb, size, ret, 0);
- trace_zonefs_file_dio_append(inode, size, ret);
-
-out_release:
- bio_release_pages(bio, false);
- bio_put(bio);
-
- if (ret >= 0) {
- iocb->ki_pos += size;
- return size;
- }
-
- return ret;
-}
-
-/*
- * Do not exceed the LFS limits nor the file zone size. If pos is under the
- * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
- */
-static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
- loff_t count)
-{
- struct inode *inode = file_inode(file);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- loff_t limit = rlimit(RLIMIT_FSIZE);
- loff_t max_size = zi->i_max_size;
-
- if (limit != RLIM_INFINITY) {
- if (pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- count = min(count, limit - pos);
- }
-
- if (!(file->f_flags & O_LARGEFILE))
- max_size = min_t(loff_t, MAX_NON_LFS, max_size);
-
- if (unlikely(pos >= max_size))
- return -EFBIG;
-
- return min(count, max_size - pos);
-}
-
-static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- loff_t count;
-
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
- if (!iov_iter_count(from))
- return 0;
-
- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
- return -EINVAL;
-
- if (iocb->ki_flags & IOCB_APPEND) {
- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
- return -EINVAL;
- mutex_lock(&zi->i_truncate_mutex);
- iocb->ki_pos = zi->i_wpoffset;
- mutex_unlock(&zi->i_truncate_mutex);
- }
-
- count = zonefs_write_check_limits(file, iocb->ki_pos,
- iov_iter_count(from));
- if (count < 0)
- return count;
-
- iov_iter_truncate(from, count);
- return iov_iter_count(from);
-}
-
-/*
- * Handle direct writes. For sequential zone files, this is the only possible
- * write path. For these files, check that the user is issuing writes
- * sequentially from the end of the file. This code assumes that the block layer
- * delivers write requests to the device in sequential order. This is always the
- * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
- * elevator feature is being used (e.g. mq-deadline). The block layer always
- * automatically select such an elevator for zoned block devices during the
- * device initialization.
- */
-static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct super_block *sb = inode->i_sb;
- bool sync = is_sync_kiocb(iocb);
- bool append = false;
- ssize_t ret, count;
-
- /*
- * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
- * as this can cause write reordering (e.g. the first aio gets EAGAIN
- * on the inode lock but the second goes through but is now unaligned).
- */
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
- (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
-
- count = zonefs_write_checks(iocb, from);
- if (count <= 0) {
- ret = count;
- goto inode_unlock;
- }
-
- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
- ret = -EINVAL;
- goto inode_unlock;
- }
-
- /* Enforce sequential writes (append only) in sequential zones */
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
- mutex_lock(&zi->i_truncate_mutex);
- if (iocb->ki_pos != zi->i_wpoffset) {
- mutex_unlock(&zi->i_truncate_mutex);
- ret = -EINVAL;
- goto inode_unlock;
- }
- mutex_unlock(&zi->i_truncate_mutex);
- append = sync;
- }
-
- if (append)
- ret = zonefs_file_dio_append(iocb, from);
- else
- ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
- &zonefs_write_dio_ops, 0, NULL, 0);
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- zi->i_wpoffset += count;
- zonefs_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
- }
-
-inode_unlock:
- inode_unlock(inode);
-
- return ret;
-}
-
-static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
- struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- ssize_t ret;
-
- /*
- * Direct IO writes are mandatory for sequential zone files so that the
- * write IO issuing order is preserved.
- */
- if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
- return -EIO;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
- } else {
- inode_lock(inode);
- }
-
- ret = zonefs_write_checks(iocb, from);
- if (ret <= 0)
- goto inode_unlock;
-
- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
- if (ret > 0)
- iocb->ki_pos += ret;
- else if (ret == -EIO)
- zonefs_io_error(inode, true);
-
-inode_unlock:
- inode_unlock(inode);
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-
- return ret;
-}
-
-static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
-
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
- if (sb_rdonly(inode->i_sb))
- return -EROFS;
-
- /* Write operations beyond the zone size are not allowed */
- if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
- return -EFBIG;
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- ssize_t ret = zonefs_file_dio_write(iocb, from);
- if (ret != -ENOTBLK)
- return ret;
- }
-
- return zonefs_file_buffered_write(iocb, from);
-}
-
-static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
- int error, unsigned int flags)
-{
- if (error) {
- zonefs_io_error(file_inode(iocb->ki_filp), false);
- return error;
- }
-
- return 0;
-}
-
-static const struct iomap_dio_ops zonefs_read_dio_ops = {
- .end_io = zonefs_file_read_dio_end_io,
-};
-
-static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct super_block *sb = inode->i_sb;
- loff_t isize;
- ssize_t ret;
-
- /* Offline zones cannot be read */
- if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
- return -EPERM;
-
- if (iocb->ki_pos >= zi->i_max_size)
- return 0;
-
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock_shared(inode))
- return -EAGAIN;
- } else {
- inode_lock_shared(inode);
- }
-
- /* Limit read operations to written data */
- mutex_lock(&zi->i_truncate_mutex);
- isize = i_size_read(inode);
- if (iocb->ki_pos >= isize) {
- mutex_unlock(&zi->i_truncate_mutex);
- ret = 0;
- goto inode_unlock;
- }
- iov_iter_truncate(to, isize - iocb->ki_pos);
- mutex_unlock(&zi->i_truncate_mutex);
-
- if (iocb->ki_flags & IOCB_DIRECT) {
- size_t count = iov_iter_count(to);
-
- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
- ret = -EINVAL;
- goto inode_unlock;
- }
- file_accessed(iocb->ki_filp);
- ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
- &zonefs_read_dio_ops, 0, NULL, 0);
- } else {
- ret = generic_file_read_iter(iocb, to);
- if (ret == -EIO)
- zonefs_io_error(inode, false);
- }
-
-inode_unlock:
- inode_unlock_shared(inode);
-
- return ret;
-}
-
-/*
- * Write open accounting is done only for sequential files.
- */
-static inline bool zonefs_seq_file_need_wro(struct inode *inode,
- struct file *file)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
-
- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
- return false;
-
- if (!(file->f_mode & FMODE_WRITE))
- return false;
-
- return true;
-}
-
-static int zonefs_seq_file_write_open(struct inode *inode)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- int ret = 0;
-
- mutex_lock(&zi->i_truncate_mutex);
-
- if (!zi->i_wr_refcnt) {
- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
- unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
-
- if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
-
- if (sbi->s_max_wro_seq_files
- && wro > sbi->s_max_wro_seq_files) {
- atomic_dec(&sbi->s_wro_seq_files);
- ret = -EBUSY;
- goto unlock;
- }
-
- if (i_size_read(inode) < zi->i_max_size) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
- if (ret) {
- atomic_dec(&sbi->s_wro_seq_files);
- goto unlock;
- }
- zi->i_flags |= ZONEFS_ZONE_OPEN;
- zonefs_account_active(inode);
- }
- }
- }
-
- zi->i_wr_refcnt++;
-
-unlock:
- mutex_unlock(&zi->i_truncate_mutex);
-
- return ret;
-}
-
-static int zonefs_file_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = generic_file_open(inode, file);
- if (ret)
- return ret;
-
- if (zonefs_seq_file_need_wro(inode, file))
- return zonefs_seq_file_write_open(inode);
-
- return 0;
-}
-
-static void zonefs_seq_file_write_close(struct inode *inode)
-{
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
- int ret = 0;
-
- mutex_lock(&zi->i_truncate_mutex);
-
- zi->i_wr_refcnt--;
- if (zi->i_wr_refcnt)
- goto unlock;
-
- /*
- * The file zone may not be open anymore (e.g. the file was truncated to
- * its maximum size or it was fully written). For this case, we only
- * need to decrement the write open count.
- */
- if (zi->i_flags & ZONEFS_ZONE_OPEN) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
- if (ret) {
- __zonefs_io_error(inode, false);
- /*
- * Leaving zones explicitly open may lead to a state
- * where most zones cannot be written (zone resources
- * exhausted). So take preventive action by remounting
- * read-only.
- */
- if (zi->i_flags & ZONEFS_ZONE_OPEN &&
- !(sb->s_flags & SB_RDONLY)) {
- zonefs_warn(sb,
- "closing zone at %llu failed %d\n",
- zi->i_zsector, ret);
- zonefs_warn(sb,
- "remounting filesystem read-only\n");
- sb->s_flags |= SB_RDONLY;
- }
- goto unlock;
- }
-
- zi->i_flags &= ~ZONEFS_ZONE_OPEN;
- zonefs_account_active(inode);
- }
-
- atomic_dec(&sbi->s_wro_seq_files);
-
-unlock:
- mutex_unlock(&zi->i_truncate_mutex);
-}
-
-static int zonefs_file_release(struct inode *inode, struct file *file)
-{
- /*
- * If we explicitly open a zone we must close it again as well, but the
- * zone management operation can fail (either due to an IO error or as
- * the zone has gone offline or read-only). Make sure we don't fail the
- * close(2) for user-space.
- */
- if (zonefs_seq_file_need_wro(inode, file))
- zonefs_seq_file_write_close(inode);
-
- return 0;
-}
-
-static const struct file_operations zonefs_file_operations = {
- .open = zonefs_file_open,
- .release = zonefs_file_release,
- .fsync = zonefs_file_fsync,
- .mmap = zonefs_file_mmap,
- .llseek = zonefs_file_llseek,
- .read_iter = zonefs_file_read_iter,
- .write_iter = zonefs_file_write_iter,
- .splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
- .iopoll = iocb_bio_iopoll,
-};
-
static struct kmem_cache *zonefs_inode_cachep;
static struct inode *zonefs_alloc_inode(struct super_block *sb)
@@ -1282,7 +415,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
inode_init_once(&zi->i_vnode);
mutex_init(&zi->i_truncate_mutex);
zi->i_wr_refcnt = 0;
- zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1315,8 +447,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = buf->f_bfree;
for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
- if (sbi->s_nr_files[t])
- buf->f_files += sbi->s_nr_files[t] + 1;
+ if (sbi->s_zgroup[t].g_nr_zones)
+ buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1;
}
buf->f_ffree = 0;
@@ -1408,185 +540,440 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data)
return zonefs_parse_options(sb, data);
}
-static const struct super_operations zonefs_sops = {
- .alloc_inode = zonefs_alloc_inode,
- .free_inode = zonefs_free_inode,
- .statfs = zonefs_statfs,
- .remount_fs = zonefs_remount,
- .show_options = zonefs_show_options,
-};
+static int zonefs_inode_setattr(struct mnt_idmap *idmap,
+ struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
+ if (ret)
+ return ret;
+
+ /*
+ * Since files and directories cannot be created nor deleted, do not
+ * allow setting any write attributes on the sub-directories grouping
+ * files by zone type.
+ */
+ if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
+ (iattr->ia_mode & 0222))
+ return -EPERM;
+
+ if (((iattr->ia_valid & ATTR_UID) &&
+ !uid_eq(iattr->ia_uid, inode->i_uid)) ||
+ ((iattr->ia_valid & ATTR_GID) &&
+ !gid_eq(iattr->ia_gid, inode->i_gid))) {
+ ret = dquot_transfer(&nop_mnt_idmap, inode, iattr);
+ if (ret)
+ return ret;
+ }
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ ret = zonefs_file_truncate(inode, iattr->ia_size);
+ if (ret)
+ return ret;
+ }
-static const struct inode_operations zonefs_dir_inode_operations = {
- .lookup = simple_lookup,
+ setattr_copy(&nop_mnt_idmap, inode, iattr);
+
+ if (S_ISREG(inode->i_mode)) {
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+ z->z_mode = inode->i_mode;
+ z->z_uid = inode->i_uid;
+ z->z_gid = inode->i_gid;
+ }
+
+ return 0;
+}
+
+static const struct inode_operations zonefs_file_inode_operations = {
.setattr = zonefs_inode_setattr,
};
-static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
- enum zonefs_ztype type)
+static long zonefs_fname_to_fno(const struct qstr *fname)
{
- struct super_block *sb = parent->i_sb;
+ const char *name = fname->name;
+ unsigned int len = fname->len;
+ long fno = 0, shift = 1;
+ const char *rname;
+ char c = *name;
+ unsigned int i;
- inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1;
- inode_init_owner(&nop_mnt_idmap, inode, parent, S_IFDIR | 0555);
- inode->i_op = &zonefs_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
- set_nlink(inode, 2);
- inc_nlink(parent);
+ /*
+ * File names are always a base-10 number string without any
+ * leading 0s.
+ */
+ if (!isdigit(c))
+ return -ENOENT;
+
+ if (len > 1 && c == '0')
+ return -ENOENT;
+
+ if (len == 1)
+ return c - '0';
+
+ for (i = 0, rname = name + len - 1; i < len; i++, rname--) {
+ c = *rname;
+ if (!isdigit(c))
+ return -ENOENT;
+ fno += (c - '0') * shift;
+ shift *= 10;
+ }
+
+ return fno;
}
-static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static struct inode *zonefs_get_file_inode(struct inode *dir,
+ struct dentry *dentry)
{
- struct super_block *sb = inode->i_sb;
+ struct zonefs_zone_group *zgroup = dir->i_private;
+ struct super_block *sb = dir->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
- struct zonefs_inode_info *zi = ZONEFS_I(inode);
- int ret = 0;
+ struct zonefs_zone *z;
+ struct inode *inode;
+ ino_t ino;
+ long fno;
- inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
- inode->i_mode = S_IFREG | sbi->s_perm;
+ /* Get the file number from the file name */
+ fno = zonefs_fname_to_fno(&dentry->d_name);
+ if (fno < 0)
+ return ERR_PTR(fno);
- zi->i_ztype = type;
- zi->i_zsector = zone->start;
- zi->i_zone_size = zone->len << SECTOR_SHIFT;
- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
- !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
- zonefs_err(sb,
- "zone size %llu doesn't match device's zone sectors %llu\n",
- zi->i_zone_size,
- bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
- return -EINVAL;
- }
+ if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones)
+ return ERR_PTR(-ENOENT);
- zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
- zone->capacity << SECTOR_SHIFT);
- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
+ z = &zgroup->g_zones[fno];
+ ino = z->z_sector >> sbi->s_zone_sectors_shift;
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW)) {
+ WARN_ON_ONCE(inode->i_private != z);
+ return inode;
+ }
- inode->i_uid = sbi->s_uid;
- inode->i_gid = sbi->s_gid;
- inode->i_size = zi->i_wpoffset;
- inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;
+ inode->i_ino = ino;
+ inode->i_mode = z->z_mode;
+ inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+ inode->i_uid = z->z_uid;
+ inode->i_gid = z->z_gid;
+ inode->i_size = z->z_wpoffset;
+ inode->i_blocks = z->z_capacity >> SECTOR_SHIFT;
+ inode->i_private = z;
inode->i_op = &zonefs_file_inode_operations;
inode->i_fop = &zonefs_file_operations;
inode->i_mapping->a_ops = &zonefs_file_aops;
- sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
- sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
- sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+ /* Update the inode access rights depending on the zone condition */
+ zonefs_inode_update_mode(inode);
+
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
+ enum zonefs_ztype ztype)
+{
+ struct inode *root = d_inode(sb->s_root);
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct inode *inode;
+ ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ inode->i_ino = ino;
+ inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
+ inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
+ inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime;
+ inode->i_private = &sbi->s_zgroup[ztype];
+ set_nlink(inode, 2);
+
+ inode->i_op = &zonefs_dir_inode_operations;
+ inode->i_fop = &zonefs_dir_operations;
+
+ unlock_new_inode(inode);
+
+ return inode;
+}
- mutex_lock(&zi->i_truncate_mutex);
+
+static struct inode *zonefs_get_dir_inode(struct inode *dir,
+ struct dentry *dentry)
+{
+ struct super_block *sb = dir->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ const char *name = dentry->d_name.name;
+ enum zonefs_ztype ztype;
/*
- * For sequential zones, make sure that any open zone is closed first
- * to ensure that the initial number of open zones is 0, in sync with
- * the open zone accounting done when the mount option
- * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ * We only need to check for the "seq" directory and
+ * the "cnv" directory if we have conventional zones.
*/
- if (type == ZONEFS_ZTYPE_SEQ &&
- (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
- zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
- if (ret)
- goto unlock;
+ if (dentry->d_name.len != 3)
+ return ERR_PTR(-ENOENT);
+
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ if (sbi->s_zgroup[ztype].g_nr_zones &&
+ memcmp(name, zonefs_zgroup_name(ztype), 3) == 0)
+ break;
}
+ if (ztype == ZONEFS_ZTYPE_MAX)
+ return ERR_PTR(-ENOENT);
- zonefs_account_active(inode);
+ return zonefs_get_zgroup_inode(sb, ztype);
+}
-unlock:
- mutex_unlock(&zi->i_truncate_mutex);
+static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
- return ret;
+ if (dentry->d_name.len > ZONEFS_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ if (dir == d_inode(dir->i_sb->s_root))
+ inode = zonefs_get_dir_inode(dir, dentry);
+ else
+ inode = zonefs_get_file_inode(dir, dentry);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_splice_alias(inode, dentry);
}
-static struct dentry *zonefs_create_inode(struct dentry *parent,
- const char *name, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_readdir_root(struct file *file, struct dir_context *ctx)
{
- struct inode *dir = d_inode(parent);
- struct dentry *dentry;
- struct inode *inode;
- int ret = -ENOMEM;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV;
+ ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1;
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- return ERR_PTR(ret);
+ if (ctx->pos >= inode->i_size)
+ return 0;
- inode = new_inode(parent->d_sb);
- if (!inode)
- goto dput;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone) {
- ret = zonefs_init_file_inode(inode, zone, type);
- if (ret) {
- iput(inode);
- goto dput;
- }
- } else {
- zonefs_init_dir_inode(dir, inode, type);
+ if (ctx->pos == 2) {
+ if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones)
+ ztype = ZONEFS_ZTYPE_SEQ;
+
+ if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+ base_ino + ztype, DT_DIR))
+ return 0;
+ ctx->pos++;
}
- d_add(dentry, inode);
- dir->i_size++;
+ if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) {
+ ztype = ZONEFS_ZTYPE_SEQ;
+ if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+ base_ino + ztype, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
- return dentry;
+ return 0;
+}
+
+static int zonefs_readdir_zgroup(struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct zonefs_zone_group *zgroup = inode->i_private;
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct zonefs_zone *z;
+ int fname_len;
+ char *fname;
+ ino_t ino;
+ int f;
+
+ /*
+ * The size of zone group directories is equal to the number
+ * of zone files in the group and does note include the "." and
+ * ".." entries. Hence the "+ 2" here.
+ */
+ if (ctx->pos >= inode->i_size + 2)
+ return 0;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
+ if (!fname)
+ return -ENOMEM;
+
+ for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) {
+ z = &zgroup->g_zones[f];
+ ino = z->z_sector >> sbi->s_zone_sectors_shift;
+ fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f);
+ if (!dir_emit(ctx, fname, fname_len, ino, DT_REG))
+ break;
+ ctx->pos++;
+ }
-dput:
- dput(dentry);
+ kfree(fname);
- return ERR_PTR(ret);
+ return 0;
+}
+
+static int zonefs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+
+ if (inode == d_inode(inode->i_sb->s_root))
+ return zonefs_readdir_root(file, ctx);
+
+ return zonefs_readdir_zgroup(file, ctx);
}
+const struct inode_operations zonefs_dir_inode_operations = {
+ .lookup = zonefs_lookup,
+ .setattr = zonefs_inode_setattr,
+};
+
+const struct file_operations zonefs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = zonefs_readdir,
+};
+
struct zonefs_zone_data {
struct super_block *sb;
unsigned int nr_zones[ZONEFS_ZTYPE_MAX];
+ sector_t cnv_zone_start;
struct blk_zone *zones;
};
+static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct zonefs_zone_data *zd = data;
+ struct super_block *sb = zd->sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+
+ /*
+ * We do not care about the first zone: it contains the super block
+ * and not exposed as a file.
+ */
+ if (!idx)
+ return 0;
+
+ /*
+ * Count the number of zones that will be exposed as files.
+ * For sequential zones, we always have as many files as zones.
+ * FOr conventional zones, the number of files depends on if we have
+ * conventional zones aggregation enabled.
+ */
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (sbi->s_features & ZONEFS_F_AGGRCNV) {
+ /* One file per set of contiguous conventional zones */
+ if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) ||
+ zone->start != zd->cnv_zone_start)
+ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
+ zd->cnv_zone_start = zone->start + zone->len;
+ } else {
+ /* One file per zone */
+ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
+ }
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++;
+ break;
+ default:
+ zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
+ zone->type);
+ return -EIO;
+ }
+
+ memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));
+
+ return 0;
+}
+
+static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
+{
+ struct block_device *bdev = zd->sb->s_bdev;
+ int ret;
+
+ zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone),
+ GFP_KERNEL);
+ if (!zd->zones)
+ return -ENOMEM;
+
+ /* Get zones information from the device */
+ ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
+ zonefs_get_zone_info_cb, zd);
+ if (ret < 0) {
+ zonefs_err(zd->sb, "Zone report failed %d\n", ret);
+ return ret;
+ }
+
+ if (ret != bdev_nr_zones(bdev)) {
+ zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
+ ret, bdev_nr_zones(bdev));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd)
+{
+ kvfree(zd->zones);
+}
+
/*
* Create a zone group and populate it with zone files.
*/
-static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
- enum zonefs_ztype type)
+static int zonefs_init_zgroup(struct super_block *sb,
+ struct zonefs_zone_data *zd,
+ enum zonefs_ztype ztype)
{
- struct super_block *sb = zd->sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
struct blk_zone *zone, *next, *end;
- const char *zgroup_name;
- char *file_name;
- struct dentry *dir, *dent;
+ struct zonefs_zone *z;
unsigned int n = 0;
int ret;
- /* If the group is empty, there is nothing to do */
- if (!zd->nr_zones[type])
+ /* Allocate the zone group. If it is empty, we have nothing to do. */
+ if (!zgroup->g_nr_zones)
return 0;
- file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
- if (!file_name)
+ zgroup->g_zones = kvcalloc(zgroup->g_nr_zones,
+ sizeof(struct zonefs_zone), GFP_KERNEL);
+ if (!zgroup->g_zones)
return -ENOMEM;
- if (type == ZONEFS_ZTYPE_CNV)
- zgroup_name = "cnv";
- else
- zgroup_name = "seq";
-
- dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (IS_ERR(dir)) {
- ret = PTR_ERR(dir);
- goto free;
- }
-
/*
- * The first zone contains the super block: skip it.
+ * Initialize the zone groups using the device zone information.
+ * We always skip the first zone as it contains the super block
+ * and is not use to back a file.
*/
end = zd->zones + bdev_nr_zones(sb->s_bdev);
for (zone = &zd->zones[1]; zone < end; zone = next) {
next = zone + 1;
- if (zonefs_zone_type(zone) != type)
+ if (zonefs_zone_type(zone) != ztype)
continue;
+ if (WARN_ON_ONCE(n >= zgroup->g_nr_zones))
+ return -EINVAL;
+
/*
* For conventional zones, contiguous zones can be aggregated
* together to form larger files. Note that this overwrites the
@@ -1595,10 +982,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
* found, assume that all zones aggregated have the same
* condition.
*/
- if (type == ZONEFS_ZTYPE_CNV &&
+ if (ztype == ZONEFS_ZTYPE_CNV &&
(sbi->s_features & ZONEFS_F_AGGRCNV)) {
for (; next < end; next++) {
- if (zonefs_zone_type(next) != type)
+ if (zonefs_zone_type(next) != ztype)
break;
zone->len += next->len;
zone->capacity += next->capacity;
@@ -1608,99 +995,118 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
else if (next->cond == BLK_ZONE_COND_OFFLINE)
zone->cond = BLK_ZONE_COND_OFFLINE;
}
- if (zone->capacity != zone->len) {
- zonefs_err(sb, "Invalid conventional zone capacity\n");
- ret = -EINVAL;
- goto free;
- }
}
+ z = &zgroup->g_zones[n];
+ if (ztype == ZONEFS_ZTYPE_CNV)
+ z->z_flags |= ZONEFS_ZONE_CNV;
+ z->z_sector = zone->start;
+ z->z_size = zone->len << SECTOR_SHIFT;
+ if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+ !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+ zonefs_err(sb,
+ "Invalid zone size %llu (device zone sectors %llu)\n",
+ z->z_size,
+ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+ return -EINVAL;
+ }
+
+ z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE,
+ zone->capacity << SECTOR_SHIFT);
+ z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone);
+
+ z->z_mode = S_IFREG | sbi->s_perm;
+ z->z_uid = sbi->s_uid;
+ z->z_gid = sbi->s_gid;
+
/*
- * Use the file number within its group as file name.
+ * Let zonefs_inode_update_mode() know that we will need
+ * special initialization of the inode mode the first time
+ * it is accessed.
*/
- snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- dent = zonefs_create_inode(dir, file_name, zone, type);
- if (IS_ERR(dent)) {
- ret = PTR_ERR(dent);
- goto free;
+ z->z_flags |= ZONEFS_ZONE_INIT_MODE;
+
+ sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes);
+ sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits;
+ sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits;
+
+ /*
+ * For sequential zones, make sure that any open zone is closed
+ * first to ensure that the initial number of open zones is 0,
+ * in sync with the open zone accounting done when the mount
+ * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (ztype == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE);
+ if (ret)
+ return ret;
}
+ zonefs_account_active(sb, z);
+
n++;
}
- zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
- zgroup_name, n, n > 1 ? "s" : "");
-
- sbi->s_nr_files[type] = n;
- ret = 0;
+ if (WARN_ON_ONCE(n != zgroup->g_nr_zones))
+ return -EINVAL;
-free:
- kfree(file_name);
+ zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
+ zonefs_zgroup_name(ztype),
+ zgroup->g_nr_zones,
+ zgroup->g_nr_zones > 1 ? "s" : "");
- return ret;
+ return 0;
}
-static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
- void *data)
+static void zonefs_free_zgroups(struct super_block *sb)
{
- struct zonefs_zone_data *zd = data;
-
- /*
- * Count the number of usable zones: the first zone at index 0 contains
- * the super block and is ignored.
- */
- switch (zone->type) {
- case BLK_ZONE_TYPE_CONVENTIONAL:
- zone->wp = zone->start + zone->len;
- if (idx)
- zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
- break;
- case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
- if (idx)
- zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
- break;
- default:
- zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
- zone->type);
- return -EIO;
- }
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ enum zonefs_ztype ztype;
- memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));
+ if (!sbi)
+ return;
- return 0;
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ kvfree(sbi->s_zgroup[ztype].g_zones);
+ sbi->s_zgroup[ztype].g_zones = NULL;
+ }
}
-static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
+/*
+ * Create a zone group and populate it with zone files.
+ */
+static int zonefs_init_zgroups(struct super_block *sb)
{
- struct block_device *bdev = zd->sb->s_bdev;
+ struct zonefs_zone_data zd;
+ enum zonefs_ztype ztype;
int ret;
- zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone),
- GFP_KERNEL);
- if (!zd->zones)
- return -ENOMEM;
-
- /* Get zones information from the device */
- ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
- zonefs_get_zone_info_cb, zd);
- if (ret < 0) {
- zonefs_err(zd->sb, "Zone report failed %d\n", ret);
- return ret;
- }
+ /* First get the device zone information */
+ memset(&zd, 0, sizeof(struct zonefs_zone_data));
+ zd.sb = sb;
+ ret = zonefs_get_zone_info(&zd);
+ if (ret)
+ goto cleanup;
- if (ret != bdev_nr_zones(bdev)) {
- zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
- ret, bdev_nr_zones(bdev));
- return -EIO;
+ /* Allocate and initialize the zone groups */
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ ret = zonefs_init_zgroup(sb, &zd, ztype);
+ if (ret) {
+ zonefs_info(sb,
+ "Zone group \"%s\" initialization failed\n",
+ zonefs_zgroup_name(ztype));
+ break;
+ }
}
- return 0;
-}
+cleanup:
+ zonefs_free_zone_info(&zd);
+ if (ret)
+ zonefs_free_zgroups(sb);
-static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
-{
- kvfree(zd->zones);
+ return ret;
}
/*
@@ -1785,6 +1191,50 @@ free_page:
return ret;
}
+static const struct super_operations zonefs_sops = {
+ .alloc_inode = zonefs_alloc_inode,
+ .free_inode = zonefs_free_inode,
+ .statfs = zonefs_statfs,
+ .remount_fs = zonefs_remount,
+ .show_options = zonefs_show_options,
+};
+
+static int zonefs_get_zgroup_inodes(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct inode *dir_inode;
+ enum zonefs_ztype ztype;
+
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ if (!sbi->s_zgroup[ztype].g_nr_zones)
+ continue;
+
+ dir_inode = zonefs_get_zgroup_inode(sb, ztype);
+ if (IS_ERR(dir_inode))
+ return PTR_ERR(dir_inode);
+
+ sbi->s_zgroup[ztype].g_inode = dir_inode;
+ }
+
+ return 0;
+}
+
+static void zonefs_release_zgroup_inodes(struct super_block *sb)
+{
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ enum zonefs_ztype ztype;
+
+ if (!sbi)
+ return;
+
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ if (sbi->s_zgroup[ztype].g_inode) {
+ iput(sbi->s_zgroup[ztype].g_inode);
+ sbi->s_zgroup[ztype].g_inode = NULL;
+ }
+ }
+}
+
/*
* Check that the device is zoned. If it is, get the list of zones and create
* sub-directories and files according to the device zone configuration and
@@ -1792,10 +1242,9 @@ free_page:
*/
static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
{
- struct zonefs_zone_data zd;
struct zonefs_sb_info *sbi;
struct inode *inode;
- enum zonefs_ztype t;
+ enum zonefs_ztype ztype;
int ret;
if (!bdev_is_zoned(sb->s_bdev)) {
@@ -1845,16 +1294,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
if (ret)
return ret;
- memset(&zd, 0, sizeof(struct zonefs_zone_data));
- zd.sb = sb;
- ret = zonefs_get_zone_info(&zd);
- if (ret)
- goto cleanup;
-
- ret = zonefs_sysfs_register(sb);
- if (ret)
- goto cleanup;
-
zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
if (!sbi->s_max_wro_seq_files &&
@@ -1865,7 +1304,12 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
}
- /* Create root directory inode */
+ /* Initialize the zone groups */
+ ret = zonefs_init_zgroups(sb);
+ if (ret)
+ goto cleanup;
+
+ /* Create the root directory inode */
ret = -ENOMEM;
inode = new_inode(sb);
if (!inode)
@@ -1875,22 +1319,37 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_mode = S_IFDIR | 0555;
inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
inode->i_op = &zonefs_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ inode->i_fop = &zonefs_dir_operations;
+ inode->i_size = 2;
set_nlink(inode, 2);
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ if (sbi->s_zgroup[ztype].g_nr_zones) {
+ inc_nlink(inode);
+ inode->i_size++;
+ }
+ }
sb->s_root = d_make_root(inode);
if (!sb->s_root)
goto cleanup;
- /* Create and populate files in zone groups directories */
- for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
- ret = zonefs_create_zgroup(&zd, t);
- if (ret)
- break;
- }
+ /*
+ * Take a reference on the zone groups directory inodes
+ * to keep them in the inode cache.
+ */
+ ret = zonefs_get_zgroup_inodes(sb);
+ if (ret)
+ goto cleanup;
+
+ ret = zonefs_sysfs_register(sb);
+ if (ret)
+ goto cleanup;
+
+ return 0;
cleanup:
- zonefs_cleanup_zone_info(&zd);
+ zonefs_release_zgroup_inodes(sb);
+ zonefs_free_zgroups(sb);
return ret;
}
@@ -1905,11 +1364,13 @@ static void zonefs_kill_super(struct super_block *sb)
{
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
- if (sb->s_root)
- d_genocide(sb->s_root);
+ /* Release the reference on the zone group directory inodes */
+ zonefs_release_zgroup_inodes(sb);
- zonefs_sysfs_unregister(sb);
kill_block_super(sb);
+
+ zonefs_sysfs_unregister(sb);
+ zonefs_free_zgroups(sb);
kfree(sbi);
}
diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c
index 9920689dc098..8ccb65c2b419 100644
--- a/fs/zonefs/sysfs.c
+++ b/fs/zonefs/sysfs.c
@@ -79,7 +79,7 @@ static const struct sysfs_ops zonefs_sysfs_attr_ops = {
.show = zonefs_sysfs_attr_show,
};
-static struct kobj_type zonefs_sb_ktype = {
+static const struct kobj_type zonefs_sb_ktype = {
.default_groups = zonefs_sysfs_groups,
.sysfs_ops = &zonefs_sysfs_attr_ops,
.release = zonefs_sysfs_sb_release,
diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h
index 42edcfd393ed..9969db3a9c7d 100644
--- a/fs/zonefs/trace.h
+++ b/fs/zonefs/trace.h
@@ -20,8 +20,9 @@
#define show_dev(dev) MAJOR(dev), MINOR(dev)
TRACE_EVENT(zonefs_zone_mgmt,
- TP_PROTO(struct inode *inode, enum req_op op),
- TP_ARGS(inode, op),
+ TP_PROTO(struct super_block *sb, struct zonefs_zone *z,
+ enum req_op op),
+ TP_ARGS(sb, z, op),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(ino_t, ino)
@@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt,
__field(sector_t, nr_sectors)
),
TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
+ __entry->dev = sb->s_dev;
+ __entry->ino =
+ z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift;
__entry->op = op;
- __entry->sector = ZONEFS_I(inode)->i_zsector;
- __entry->nr_sectors =
- ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT;
+ __entry->sector = z->z_sector;
+ __entry->nr_sectors = z->z_size >> SECTOR_SHIFT;
),
TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu",
show_dev(__entry->dev), (unsigned long)__entry->ino,
@@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->sector = ZONEFS_I(inode)->i_zsector;
+ __entry->sector = zonefs_inode_zone(inode)->z_sector;
__entry->size = size;
- __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset;
+ __entry->wpoffset =
+ zonefs_inode_zone(inode)->z_wpoffset;
__entry->ret = ret;
),
TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu",
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
index 1dbe78119ff1..8175652241b5 100644
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -39,31 +39,53 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
return ZONEFS_ZTYPE_SEQ;
}
-#define ZONEFS_ZONE_OPEN (1U << 0)
-#define ZONEFS_ZONE_ACTIVE (1U << 1)
-#define ZONEFS_ZONE_OFFLINE (1U << 2)
-#define ZONEFS_ZONE_READONLY (1U << 3)
+#define ZONEFS_ZONE_INIT_MODE (1U << 0)
+#define ZONEFS_ZONE_OPEN (1U << 1)
+#define ZONEFS_ZONE_ACTIVE (1U << 2)
+#define ZONEFS_ZONE_OFFLINE (1U << 3)
+#define ZONEFS_ZONE_READONLY (1U << 4)
+#define ZONEFS_ZONE_CNV (1U << 31)
/*
- * In-memory inode data.
+ * In-memory per-file inode zone data.
*/
-struct zonefs_inode_info {
- struct inode i_vnode;
+struct zonefs_zone {
+ /* Zone state flags */
+ unsigned int z_flags;
- /* File zone type */
- enum zonefs_ztype i_ztype;
+ /* Zone start sector (512B unit) */
+ sector_t z_sector;
- /* File zone start sector (512B unit) */
- sector_t i_zsector;
+ /* Zone size (bytes) */
+ loff_t z_size;
- /* File zone write pointer position (sequential zones only) */
- loff_t i_wpoffset;
+ /* Zone capacity (file maximum size, bytes) */
+ loff_t z_capacity;
- /* File maximum size */
- loff_t i_max_size;
+ /* Write pointer offset in the zone (sequential zones only, bytes) */
+ loff_t z_wpoffset;
+
+ /* Saved inode uid, gid and access rights */
+ umode_t z_mode;
+ kuid_t z_uid;
+ kgid_t z_gid;
+};
+
+/*
+ * In memory zone group information: all zones of a group are exposed
+ * as files, one file per zone.
+ */
+struct zonefs_zone_group {
+ struct inode *g_inode;
+ unsigned int g_nr_zones;
+ struct zonefs_zone *g_zones;
+};
- /* File zone size */
- loff_t i_zone_size;
+/*
+ * In-memory inode data.
+ */
+struct zonefs_inode_info {
+ struct inode i_vnode;
/*
* To serialise fully against both syscall and mmap based IO and
@@ -82,7 +104,6 @@ struct zonefs_inode_info {
/* guarded by i_truncate_mutex */
unsigned int i_wr_refcnt;
- unsigned int i_flags;
};
static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
@@ -90,6 +111,31 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
return container_of(inode, struct zonefs_inode_info, i_vnode);
}
+static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z)
+{
+ return z->z_flags & ZONEFS_ZONE_CNV;
+}
+
+static inline bool zonefs_zone_is_seq(struct zonefs_zone *z)
+{
+ return !zonefs_zone_is_cnv(z);
+}
+
+static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline bool zonefs_inode_is_cnv(struct inode *inode)
+{
+ return zonefs_zone_is_cnv(zonefs_inode_zone(inode));
+}
+
+static inline bool zonefs_inode_is_seq(struct inode *inode)
+{
+ return zonefs_zone_is_seq(zonefs_inode_zone(inode));
+}
+
/*
* On-disk super block (block 0).
*/
@@ -181,7 +227,7 @@ struct zonefs_sb_info {
uuid_t s_uuid;
unsigned int s_zone_sectors_shift;
- unsigned int s_nr_files[ZONEFS_ZTYPE_MAX];
+ struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX];
loff_t s_blocks;
loff_t s_used_blocks;
@@ -209,6 +255,32 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
#define zonefs_warn(sb, format, args...) \
pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
+/* In super.c */
+void zonefs_inode_account_active(struct inode *inode);
+int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op);
+void zonefs_i_size_write(struct inode *inode, loff_t isize);
+void zonefs_update_stats(struct inode *inode, loff_t new_isize);
+void __zonefs_io_error(struct inode *inode, bool write);
+
+static inline void zonefs_io_error(struct inode *inode, bool write)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+
+ mutex_lock(&zi->i_truncate_mutex);
+ __zonefs_io_error(inode, write);
+ mutex_unlock(&zi->i_truncate_mutex);
+}
+
+/* In super.c */
+extern const struct inode_operations zonefs_dir_inode_operations;
+extern const struct file_operations zonefs_dir_operations;
+
+/* In file.c */
+extern const struct address_space_operations zonefs_file_aops;
+extern const struct file_operations zonefs_file_operations;
+int zonefs_file_truncate(struct inode *inode, loff_t isize);
+
+/* In sysfs.c */
int zonefs_sysfs_register(struct super_block *sb);
void zonefs_sysfs_unregister(struct super_block *sb);
int zonefs_sysfs_init(void);