Diffstat (limited to 'fs')
126 files changed, 6074 insertions, 5159 deletions
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 3b7e3b9e4fd2..4c0d53bf931a 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -18,40 +18,38 @@ config CIFS
     select DNS_RESOLVER
     select ASN1
     select OID_REGISTRY
+    select NETFS_SUPPORT
     help
-      This is the client VFS module for the SMB3 family of NAS protocols,
-      (including support for the most recent, most secure dialect SMB3.1.1)
-      as well as for earlier dialects such as SMB2.1, SMB2 and the older
-      Common Internet File System (CIFS) protocol. CIFS was the successor
-      to the original dialect, the Server Message Block (SMB) protocol, the
-      native file sharing mechanism for most early PC operating systems.
-
-      The SMB3 protocol is supported by most modern operating systems
-      and NAS appliances (e.g. Samba, Windows 10, Windows Server 2016,
-      MacOS) and even in the cloud (e.g. Microsoft Azure).
-      The older CIFS protocol was included in Windows NT4, 2000 and XP (and
-      later) as well by Samba (which provides excellent CIFS and SMB3
-      server support for Linux and many other operating systems). Use of
-      dialects older than SMB2.1 is often discouraged on public networks.
+      This is the client VFS module for the SMB3 family of network file
+      protocols (including the most recent, most secure dialect SMB3.1.1).
+      This module also includes support for earlier dialects such as
+      SMB2.1, SMB2 and even the old Common Internet File System (CIFS)
+      protocol. CIFS was the successor to the original network filesystem
+      protocol, Server Message Block (SMB ie SMB1), the native file sharing
+      mechanism for most early PC operating systems.
+
+      The SMB3.1.1 protocol is supported by most modern operating systems
+      and NAS appliances (e.g. Samba, Windows 11, Windows Server 2022,
+      MacOS) and even in the cloud (e.g. Microsoft Azure) and also by the
+      Linux kernel server, ksmbd. Support for the older CIFS protocol was
+      included in Windows NT4, 2000 and XP (and later). Use of dialects
+      older than SMB2.1 is often discouraged on public networks.
 
       This module also provides limited support for OS/2 and Windows ME
       and similar very old servers.
 
-      This module provides an advanced network file system client
-      for mounting to SMB3 (and CIFS) compliant servers. It includes
-      support for DFS (hierarchical name space), secure per-user
-      session establishment via Kerberos or NTLM or NTLMv2, RDMA
-      (smbdirect), advanced security features, per-share encryption,
-      directory leases, safe distributed caching (oplock), optional packet
-      signing, Unicode and other internationalization improvements.
+      This module provides an advanced network file system client for
+      mounting to SMB3 (and CIFS) compliant servers. It includes support
+      for DFS (hierarchical name space), secure per-user session
+      establishment via Kerberos or NTLMv2, RDMA (smbdirect), advanced
+      security features, per-share encryption, packet-signing, snapshots,
+      directory leases, safe distributed caching (leases), multichannel,
+      Unicode and other internationalization improvements.
 
       In general, the default dialects, SMB3 and later, enable better
       performance, security and features, than would be possible with CIFS.
 
-      Note that when mounting to Samba, due to the CIFS POSIX extensions,
-      CIFS mounts can provide slightly better POSIX compatibility
-      than SMB3 mounts. SMB2/SMB3 mount options are also
-      slightly simpler (compared to CIFS) due to protocol improvements.
-      If you need to mount to Samba, Azure, Macs or Windows from this machine, say Y.
+ If you need to mount to Samba, Azure, ksmbd, Macs or Windows from this + machine, say Y. config CIFS_STATS2 bool "Extended statistics" @@ -111,12 +109,12 @@ config CIFS_POSIX depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. + negotiate a feature of the older cifs dialect with servers, such as + Samba 3.0.5 or later, that optionally can handle more POSIX like + (rather than Windows like) file behavior. It also enables support + for POSIX ACLs (getfacl and setfacl) to servers (such as Samba 3.10 + and later) which can negotiate CIFS POSIX ACL support. This config + option is not needed when mounting with SMB3.1.1. If unsure, say N. config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -178,6 +176,8 @@ config CIFS_NFSD_EXPORT help Allows NFS server to export a CIFS mounted share (nfsd over cifs) +if CIFS + config CIFS_SMB_DIRECT bool "SMB Direct support" depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y @@ -201,3 +201,5 @@ config CIFS_ROOT Enables root file system support over SMB protocol. Most people say N here. + +endif diff --git a/fs/cifs/cached_dir.c b/fs/cifs/cached_dir.c index 60399081046a..75d5e06306ea 100644 --- a/fs/cifs/cached_dir.c +++ b/fs/cifs/cached_dir.c @@ -14,6 +14,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); +static void smb2_close_cached_fid(struct kref *ref); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, @@ -181,12 +182,13 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE); - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.fid = pfid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .fid = pfid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -220,8 +222,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } goto oshr_free; } - - atomic_inc(&tcon->num_remote_opens); + cfid->tcon = tcon; + cfid->is_open = true; o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; @@ -233,12 +235,12 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) goto oshr_free; - smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL, NULL); - + if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) + goto oshr_free; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) goto oshr_free; @@ -259,9 +261,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, } } cfid->dentry = dentry; - cfid->tcon = tcon; cfid->time = jiffies; - cfid->is_open = 
true; cfid->has_lease = true; oshr_free: @@ -271,7 +271,7 @@ oshr_free: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); spin_lock(&cfids->cfid_list_lock); - if (!cfid->has_lease) { + if (rc && !cfid->has_lease) { if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; @@ -280,13 +280,27 @@ oshr_free: rc = -ENOENT; } spin_unlock(&cfids->cfid_list_lock); + if (!rc && !cfid->has_lease) { + /* + * We are guaranteed to have two references at this point. + * One for the caller and one for a potential lease. + * Release the Lease-ref so that the directory will be closed + * when the caller closes the cached handle. + */ + kref_put(&cfid->refcount, smb2_close_cached_fid); + } if (rc) { + if (cfid->is_open) + SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, + cfid->fid.volatile_fid); free_cached_dir(cfid); cfid = NULL; } - if (rc == 0) + if (rc == 0) { *ret_cfid = cfid; + atomic_inc(&tcon->num_remote_opens); + } return rc; } @@ -335,6 +349,7 @@ smb2_close_cached_fid(struct kref *ref) if (cfid->is_open) { SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); + atomic_dec(&cfid->tcon->num_remote_opens); } free_cached_dir(cfid); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 56b23def4c95..1911f7016fa1 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/string.h> #include <linux/ctype.h> +#include <linux/kstrtox.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> @@ -455,8 +456,10 @@ skip_rdma: spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\n\tServer interfaces: %zu", - ses->iface_count); + seq_printf(m, "\n\n\tServer interfaces: %zu" + "\tLast updated: %lu seconds ago", + ses->iface_count, + (jiffies - ses->iface_last_update) / HZ); j = 0; list_for_each_entry(iface, &ses->iface_list, iface_head) { @@ -787,7 +790,7 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, rc = get_user(c[0], buffer); if (rc) return rc; - if (strtobool(c, &bv) == 0) + if (kstrtobool(c, &bv) == 0) cifsFYI = bv; else if ((c[0] > '1') && (c[0] <= '9')) cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */ @@ -947,7 +950,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ - if (strtobool(flags_string, &bv) == 0) { + if (kstrtobool(flags_string, &bv) == 0) { global_secflags = bv ? 
CIFSSEC_MAX : CIFSSEC_DEF; return count; } else if (!isdigit(flags_string[0])) { diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 7f102ffeb675..e4d751b0c812 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -24,7 +24,7 @@ struct cifs_spnego_msg { uint32_t flags; uint32_t sesskey_len; uint32_t secblob_len; - uint8_t data[1]; + uint8_t data[]; }; #ifdef __KERNEL__ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 9a2d390bd06f..f5b6df82e857 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1428,14 +1428,15 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, tcon = tlink_tcon(tlink); xid = get_xid(); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = READ_CONTROL; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = READ_CONTROL, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { @@ -1494,14 +1495,15 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, else access_flags = WRITE_DAC; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index cbc18b4a9cb2..357bd27a7fd1 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -24,12 +24,156 @@ #include "../smbfs_common/arc4.h" #include <crypto/aead.h> +/* + * Hash data from a BVEC-type iterator. + */ +static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + void *p; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + p = kmap_local_page(bv[i].bv_page); + ret = crypto_shash_update(shash, p + off, len); + kunmap_local(p); + if (ret < 0) + return ret; + + maxsize -= len; + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from a KVEC-type iterator. + */ +static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + int ret; + + for (i = 0; i < iter->nr_segs; i++) { + size_t len; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + ret = crypto_shash_update(shash, kv[i].iov_base + start, len); + if (ret < 0) + return ret; + maxsize -= len; + + if (maxsize <= 0) + break; + start = 0; + } + + return 0; +} + +/* + * Hash data from an XARRAY-type iterator. 
+ */ +static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, + struct shash_desc *shash) +{ + struct folio *folios[16], *folio; + unsigned int nr, i, j, npages; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t last, index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t len, offset, foffset; + void *p; + + if (maxsize == 0) + return 0; + + last = (start + maxsize - 1) / PAGE_SIZE; + do { + nr = xa_extract(iter->xarray, (void **)folios, index, last, + ARRAY_SIZE(folios), XA_PRESENT); + if (nr == 0) + return -EIO; + + for (i = 0; i < nr; i++) { + folio = folios[i]; + npages = folio_nr_pages(folio); + foffset = start - folio_pos(folio); + offset = foffset % PAGE_SIZE; + for (j = foffset / PAGE_SIZE; j < npages; j++) { + len = min_t(size_t, maxsize, PAGE_SIZE - offset); + p = kmap_local_page(folio_page(folio, j)); + ret = crypto_shash_update(shash, p, len); + kunmap_local(p); + if (ret < 0) + return ret; + maxsize -= len; + if (maxsize <= 0) + return 0; + start += len; + offset = 0; + index++; + } + } + } while (nr == ARRAY_SIZE(folios)); + return 0; +} + +/* + * Pass the data from an iterator into a hash. + */ +static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, + struct shash_desc *shash) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + return cifs_shash_bvec(iter, maxsize, shash); + case ITER_KVEC: + return cifs_shash_kvec(iter, maxsize, shash); + case ITER_XARRAY: + return cifs_shash_xarray(iter, maxsize, shash); + default: + pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} + int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash) + struct TCP_Server_Info *server, char *signature, + struct shash_desc *shash) { int i; - int rc; + ssize_t rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; @@ -61,25 +205,9 @@ int __cifs_calc_signature(struct smb_rqst *rqst, } } - /* now hash over the rq_pages array */ - for (i = 0; i < rqst->rq_npages; i++) { - void *kaddr; - unsigned int len, offset; - - rqst_page_get_length(rqst, i, &len, &offset); - - kaddr = (char *) kmap(rqst->rq_pages[i]) + offset; - - rc = crypto_shash_update(shash, kaddr, len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with payload\n", - __func__); - kunmap(rqst->rq_pages[i]); - return rc; - } - - kunmap(rqst->rq_pages[i]); - } + rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash); + if (rc < 0) + return rc; rc = crypto_shash_final(shash, signature); if (rc) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cb7c5460a80b..cbcf210d56e4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1359,7 +1359,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1379,7 +1379,7 @@ const struct file_operations cifs_file_strict_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1399,7 +1399,7 @@ const struct file_operations cifs_file_direct_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - 
.splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1417,7 +1417,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1435,7 +1435,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = generic_file_splice_read, + .splice_read = cifs_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1453,7 +1453,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = direct_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b58cd737b21e..71fe0a0a7992 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,6 +100,9 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); +extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); @@ -110,6 +113,9 @@ extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); extern int cifs_readdir(struct file *file, struct dir_context *ctx); +extern void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len); +extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len); /* Functions related to dir entries */ extern const struct dentry_operations cifs_dentry_ops; @@ -154,5 +160,5 @@ extern const struct export_operations cifs_export_ops; /* when changing internal version - update following two lines at same time */ #define SMB3_PRODUCT_BUILD 41 -#define CIFS_VERSION "2.41" +#define CIFS_VERSION "2.42" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cd8171a1c9a0..a99883f16d94 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -78,10 +78,6 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 -/* dns resolution intervals in seconds */ -#define SMB_DNS_RESOLVE_INTERVAL_MIN 120 -#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600 - /* smb multichannel query server interfaces interval in seconds */ #define SMB_INTERFACE_POLL_INTERVAL 600 @@ -217,11 +213,9 @@ static inline void cifs_free_open_info(struct cifs_open_info_data *data) struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in 
array */ - struct page **rq_pages; /* pointer to array of page ptrs */ - unsigned int rq_offset; /* the offset to the 1st page */ - unsigned int rq_npages; /* number pages in array */ - unsigned int rq_pagesz; /* page size to use */ - unsigned int rq_tailsz; /* length of last page */ + size_t rq_iter_size; /* Amount of data in ->rq_iter */ + struct iov_iter rq_iter; /* Data iterator */ + struct xarray rq_buffer; /* Page buffer for encryption */ }; struct mid_q_entry; @@ -692,7 +686,6 @@ struct TCP_Server_Info { /* point to the SMBD connection if RDMA is used instead of socket */ struct smbd_connection *smbd_conn; struct delayed_work echo; /* echo ping workqueue job */ - struct delayed_work resolve; /* dns resolution workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ /* Total size of this PDU. Only valid from cifs_demultiplex_thread */ @@ -1427,10 +1420,11 @@ struct cifs_aio_ctx { struct cifsFileInfo *cfile; struct bio_vec *bv; loff_t pos; - unsigned int npages; + unsigned int nr_pinned_pages; ssize_t rc; unsigned int len; unsigned int total_len; + unsigned int bv_need_unpin; /* If ->bv[] needs unpinning */ bool should_dirty; /* * Indicates if this aio_ctx is for direct_io, @@ -1448,28 +1442,18 @@ struct cifs_readdata { struct address_space *mapping; struct cifs_aio_ctx *ctx; __u64 offset; + ssize_t got_bytes; unsigned int bytes; - unsigned int got_bytes; pid_t pid; int result; struct work_struct work; - int (*read_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - unsigned int len); - int (*copy_into_pages)(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter); + struct iov_iter iter; struct kvec iov[2]; struct TCP_Server_Info *server; #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* asynchronous write support */ @@ -1481,6 +1465,8 @@ struct cifs_writedata { struct work_struct work; struct cifsFileInfo *cfile; struct cifs_aio_ctx *ctx; + struct iov_iter iter; + struct bio_vec *bv; __u64 offset; pid_t pid; unsigned int bytes; @@ -1489,12 +1475,7 @@ struct cifs_writedata { #ifdef CONFIG_CIFS_SMB_DIRECT struct smbd_mr *mr; #endif - unsigned int pagesz; - unsigned int page_offset; - unsigned int tailsz; struct cifs_credits credits; - unsigned int nr_pages; - struct page **pages; }; /* @@ -2154,15 +2135,21 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } -static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, - int num_rqst, - const u8 *sig) +static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) { unsigned int len, skip; unsigned int nents = 0; unsigned long addr; int i, j; + /* + * The first rqst has a transform header where the first 20 bytes are + * not part of the encrypted blob. + */ + skip = 20; + /* Assumes the first rqst has a transform header as the first iov. * I.e. * rqst[0].rq_iov[0] is transform header @@ -2170,14 +2157,22 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, * rqst[1+].rq_iov[0+] data to be encrypted/decrypted */ for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. 
+ /* We really don't want a mixture of pinned and unpinned pages + * in the sglist. It's hard to keep track of which is what. + * Instead, we convert to a BVEC-type iterator higher up. */ + if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter))) + return -EIO; + + /* We also don't want to have any extra refs or pins to clean + * up in the sglist. + */ + if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter))) + return -EIO; + for (j = 0; j < rqst[i].rq_nvec; j++) { struct kvec *iov = &rqst[i].rq_iov[j]; - skip = (i == 0) && (j == 0) ? 20 : 0; addr = (unsigned long)iov->iov_base + skip; if (unlikely(is_vmalloc_addr((void *)addr))) { len = iov->iov_len - skip; @@ -2186,8 +2181,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, } else { nents++; } + skip = 0; } - nents += rqst[i].rq_npages; + nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX); } nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); return nents; @@ -2196,9 +2192,9 @@ static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, /* We can not use the normal sg_set_buf() as we will sometimes pass a * stack object as buf. */ -static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, - const void *buf, - unsigned int buflen) +static inline void cifs_sg_set_buf(struct sg_table *sgtable, + const void *buf, + unsigned int buflen) { unsigned long addr = (unsigned long)buf; unsigned int off = offset_in_page(addr); @@ -2208,16 +2204,17 @@ static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, do { unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); - sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + vmalloc_to_page((void *)addr), len, off); off = 0; addr += PAGE_SIZE; buflen -= len; } while (buflen); } else { - sg_set_page(sg++, virt_to_page(addr), buflen, off); + sg_set_page(&sgtable->sgl[sgtable->nents++], + virt_to_page(addr), buflen, off); } - return sg; } #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 623caece2b10..445e3eaebcc1 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -562,7 +562,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; __le32 Capabilities; /* see below */ __le16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* STRING NativeOS */ /* STRING NativeLanMan */ } __attribute__((packed)) req; /* NTLM request format (with @@ -582,7 +582,7 @@ typedef union smb_com_session_setup_andx { __u32 Reserved; /* see below */ __le32 Capabilities; __le16 ByteCount; - unsigned char CaseInsensitivePassword[1]; /* followed by: */ + unsigned char CaseInsensitivePassword[]; /* followed by: */ /* unsigned char * CaseSensitivePassword; */ /* STRING AccountName */ /* STRING PrimaryDomain */ @@ -599,7 +599,7 @@ typedef union smb_com_session_setup_andx { __le16 Action; /* see below */ __le16 SecurityBlobLength; __u16 ByteCount; - unsigned char SecurityBlob[1]; /* followed by */ + unsigned char SecurityBlob[]; /* followed by */ /* unsigned char * NativeOS; */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ @@ -618,7 +618,7 @@ typedef union smb_com_session_setup_andx { __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; - unsigned char AccountPassword[1]; /* followed by */ + unsigned char AccountPassword[]; /* followed by */ /* STRING AccountName */ /* STRING PrimaryDomain */ /* STRING 
NativeOS */ @@ -632,7 +632,7 @@ typedef union smb_com_session_setup_andx { __le16 AndXOffset; __le16 Action; /* see below */ __u16 ByteCount; - unsigned char NativeOS[1]; /* followed by */ + unsigned char NativeOS[]; /* followed by */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ @@ -693,7 +693,7 @@ typedef struct smb_com_tconx_req { __le16 Flags; /* see below */ __le16 PasswordLength; __le16 ByteCount; - unsigned char Password[1]; /* followed by */ + unsigned char Password[]; /* followed by */ /* STRING Path *//* \\server\share name */ /* STRING Service */ } __attribute__((packed)) TCONX_REQ; @@ -705,7 +705,7 @@ typedef struct smb_com_tconx_rsp { __le16 AndXOffset; __le16 OptionalSupport; /* see below */ __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP; @@ -718,7 +718,7 @@ typedef struct smb_com_tconx_rsp_ext { __le32 MaximalShareAccessRights; __le32 GuestMaximalShareAccessRights; __u16 ByteCount; - unsigned char Service[1]; /* always ASCII, not Unicode */ + unsigned char Service[]; /* always ASCII, not Unicode */ /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP_EXT; @@ -755,14 +755,14 @@ typedef struct smb_com_echo_req { struct smb_hdr hdr; __le16 EchoCount; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_REQ; typedef struct smb_com_echo_rsp { struct smb_hdr hdr; __le16 SequenceNumber; __le16 ByteCount; - char Data[1]; + char Data[]; } __attribute__((packed)) ECHO_RSP; typedef struct smb_com_logoff_andx_req { @@ -862,7 +862,7 @@ typedef struct smb_com_open_req { /* also handles create */ __le32 ImpersonationLevel; __u8 SecurityFlags; __le16 ByteCount; - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPEN_REQ; /* open response: oplock levels */ @@ -937,7 +937,7 @@ typedef struct smb_com_openx_req { __le32 Timeout; __le32 Reserved; __le16 ByteCount; /* file name follows */ - char fileName[1]; + char fileName[]; } __attribute__((packed)) OPENX_REQ; typedef struct smb_com_openx_rsp { @@ -1085,7 +1085,7 @@ typedef struct smb_com_lock_req { __le16 NumberOfUnlocks; __le16 NumberOfLocks; __le16 ByteCount; - LOCKING_ANDX_RANGE Locks[1]; + LOCKING_ANDX_RANGE Locks[]; } __attribute__((packed)) LOCK_REQ; /* lock type */ @@ -1114,7 +1114,7 @@ typedef struct smb_com_rename_req { __le16 SearchAttributes; /* target file attributes */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) RENAME_REQ; @@ -1134,7 +1134,7 @@ typedef struct smb_com_copy_req { __le16 Flags; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName string */ } __attribute__((packed)) COPY_REQ; @@ -1144,7 +1144,7 @@ typedef struct smb_com_copy_rsp { __le16 CopyCount; /* number of files copied */ __u16 ByteCount; /* may be zero */ __u8 BufferFormat; /* 0x04 - only present if errored file follows */ - unsigned char ErrorFileName[1]; /* only present if error in copy */ + unsigned char ErrorFileName[]; /* only present if error in copy */ } __attribute__((packed)) COPY_RSP; #define CREATE_HARD_LINK 0x103 @@ -1158,7 +1158,7 @@ 
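The cifspdu.h hunks in this region replace one-element trailing arrays such as "char fileName[1];" with C99 flexible array members ("char fileName[];", or DECLARE_FLEX_ARRAY() where the member sits in a union or would otherwise be the struct's only member). A minimal sketch, not part of this patch and with hypothetical struct/function names, of how such a structure is typically sized and filled using the kernel's struct_size() helper:

    #include <linux/overflow.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/types.h>

    struct example_name_req {               /* hypothetical on-the-wire layout */
            __le16 byte_count;
            __u8 buffer_format;             /* 4 = ASCII */
            unsigned char file_name[];      /* was: unsigned char file_name[1]; */
    };

    static struct example_name_req *example_name_req_alloc(const char *name,
                                                            size_t name_len)
    {
            struct example_name_req *req;

            /*
             * struct_size() computes sizeof(*req) plus (name_len + 1) bytes for
             * the trailing name, without the hidden extra byte that the old
             * "[1]" declarations added to sizeof() and without integer overflow.
             */
            req = kzalloc(struct_size(req, file_name, name_len + 1), GFP_KERNEL);
            if (!req)
                    return NULL;

            req->buffer_format = 0x04;
            req->byte_count = cpu_to_le16(1 + name_len + 1);
            memcpy(req->file_name, name, name_len); /* kzalloc left the NUL */
            return req;
    }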
typedef struct smb_com_nt_rename_req { /* A5 - also used for create hardlink */ __le32 ClusterCount; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII or Unicode */ - unsigned char OldFileName[1]; + unsigned char OldFileName[]; /* followed by __u8 BufferFormat2 */ /* followed by NewFileName */ } __attribute__((packed)) NT_RENAME_REQ; @@ -1173,7 +1173,7 @@ typedef struct smb_com_delete_file_req { __le16 SearchAttributes; __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } __attribute__((packed)) DELETE_FILE_REQ; typedef struct smb_com_delete_file_rsp { @@ -1185,7 +1185,7 @@ typedef struct smb_com_delete_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) DELETE_DIRECTORY_REQ; typedef struct smb_com_delete_directory_rsp { @@ -1197,7 +1197,7 @@ typedef struct smb_com_create_directory_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char DirName[1]; + unsigned char DirName[]; } __attribute__((packed)) CREATE_DIRECTORY_REQ; typedef struct smb_com_create_directory_rsp { @@ -1209,7 +1209,7 @@ typedef struct smb_com_query_information_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; /* 1 + namelen + 1 */ __u8 BufferFormat; /* 4 = ASCII */ - unsigned char FileName[1]; + unsigned char FileName[]; } __attribute__((packed)) QUERY_INFORMATION_REQ; typedef struct smb_com_query_information_rsp { @@ -1229,7 +1229,7 @@ typedef struct smb_com_setattr_req { __le16 reserved[5]; /* must be zero */ __u16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ - unsigned char fileName[1]; + unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; typedef struct smb_com_setattr_rsp { @@ -1311,7 +1311,7 @@ typedef struct smb_com_transaction_ioctl_req { __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */ __le16 ByteCount; __u8 Pad[3]; - __u8 Data[1]; + __u8 Data[]; } __attribute__((packed)) TRANSACT_IOCTL_REQ; typedef struct smb_com_transaction_compr_ioctl_req { @@ -1430,7 +1430,7 @@ typedef struct smb_com_transaction_change_notify_req { __u8 Reserved2; __le16 ByteCount; /* __u8 Pad[3];*/ -/* __u8 Data[1];*/ +/* __u8 Data[];*/ } __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ; /* BB eventually change to use generic ntransact rsp struct @@ -1519,7 +1519,7 @@ struct cifs_quota_data { __u64 space_used; __u64 soft_limit; __u64 hard_limit; - char sid[1]; /* variable size? */ + char sid[]; /* variable size? */ } __attribute__((packed)); /* quota sub commands */ @@ -1671,7 +1671,7 @@ typedef struct smb_com_transaction2_qpi_req { __u8 Pad; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_QPI_REQ; typedef struct smb_com_transaction2_qpi_rsp { @@ -1704,7 +1704,7 @@ typedef struct smb_com_transaction2_spi_req { __u16 Pad1; __le16 InformationLevel; __u32 Reserved4; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_SPI_REQ; typedef struct smb_com_transaction2_spi_rsp { @@ -1809,7 +1809,7 @@ typedef struct smb_com_transaction2_ffirst_req { __le16 SearchFlags; __le16 InformationLevel; __le32 SearchStorageType; - char FileName[1]; + char FileName[]; } __attribute__((packed)) TRANSACTION2_FFIRST_REQ; typedef struct smb_com_transaction2_ffirst_rsp { @@ -2020,7 +2020,7 @@ typedef struct smb_com_transaction2_get_dfs_refer_req { perhaps?) 
followed by one byte pad - doesn't seem to matter though */ __le16 MaxReferralLevel; - char RequestFileName[1]; + char RequestFileName[]; } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ; #define DFS_VERSION cpu_to_le16(0x0003) @@ -2049,7 +2049,7 @@ struct get_dfs_referral_rsp { __le16 PathConsumed; __le16 NumberOfReferrals; __le32 DFSFlags; - REFERRAL3 referrals[1]; /* array of level 3 dfs_referral structures */ + REFERRAL3 referrals[]; /* array of level 3 dfs_referral structures */ /* followed by the strings pointed to by the referral structures */ } __packed; @@ -2284,7 +2284,10 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ typedef struct { @@ -2322,7 +2325,7 @@ typedef struct { } __attribute__((packed)) FILE_UNIX_BASIC_INFO; /* level 0x200 QPathInfo */ typedef struct { - char LinkDest[1]; + DECLARE_FLEX_ARRAY(char, LinkDest); } __attribute__((packed)) FILE_UNIX_LINK_INFO; /* level 0x201 QPathInfo */ /* The following three structures are needed only for @@ -2371,7 +2374,7 @@ struct file_end_of_file_info { } __attribute__((packed)); /* size info, level 0x104 for set, 0x106 for query */ struct file_alt_name_info { - __u8 alt_name[1]; + DECLARE_FLEX_ARRAY(__u8, alt_name); } __attribute__((packed)); /* level 0x0108 */ struct file_stream_info { @@ -2480,7 +2483,10 @@ typedef struct { __le32 NextEntryOffset; __u32 ResumeKey; /* as with FileIndex - no need to convert */ FILE_UNIX_BASIC_INFO basic; - char FileName[1]; + union { + char __pad; + DECLARE_FLEX_ARRAY(char, FileName); + }; } __attribute__((packed)) FILE_UNIX_INFO; /* level 0x202 */ typedef struct { @@ -2494,7 +2500,7 @@ typedef struct { __le64 AllocationSize; __le32 ExtFileAttributes; __le32 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ typedef struct { @@ -2509,7 +2515,7 @@ typedef struct { __le32 ExtFileAttributes; __le32 FileNameLength; __le32 EaSize; /* length of the xattrs */ - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ typedef struct { @@ -2526,7 +2532,7 @@ typedef struct { __le32 EaSize; /* EA size */ __le32 Reserved; __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ - char FileName[1]; + char FileName[]; } __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ typedef struct { @@ -2544,7 +2550,7 @@ typedef struct { __u8 ShortNameLength; __u8 Reserved; __u8 ShortName[24]; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ typedef struct { @@ -2559,7 +2565,7 @@ typedef struct { __le32 AllocationSize; __le16 Attributes; /* verify not u32 */ __u8 FileNameLength; - char FileName[1]; + char FileName[]; } __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ @@ -2569,21 +2575,11 @@ struct win_dev { __le64 minor; } __attribute__((packed)); -struct gea { - unsigned char name_len; - char name[1]; -} __attribute__((packed)); - -struct gealist { - unsigned long list_len; - struct gea list[1]; -} __attribute__((packed)); - struct fea { unsigned char EA_flags; __u8 name_len; __le16 value_len; - char name[1]; + char name[]; /* optionally followed by value */ } __attribute__((packed)); /* 
flags for _FEA.fEA */ @@ -2591,7 +2587,7 @@ struct fea { struct fealist { __le32 list_len; - struct fea list[1]; + struct fea list; } __attribute__((packed)); /* used to hold an arbitrary blob of data */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b8a47704a6ef..b7a36ebd0f2f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -244,6 +244,9 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read); +int cifs_read_iter_from_socket(struct TCP_Server_Info *server, + struct iov_iter *iter, + unsigned int to_read); extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb); void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx); int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx); @@ -581,10 +584,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); int cifs_async_writev(struct cifs_writedata *wdata, void (*release)(struct kref *kref)); void cifs_writev_complete(struct work_struct *work); -struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages, - work_func_t complete); -struct cifs_writedata *cifs_writedata_direct_alloc(struct page **pages, - work_func_t complete); +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete); void cifs_writedata_release(struct kref *refcount); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, @@ -601,13 +601,10 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); -int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 60dd4e37030a..a24e4ddf8043 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -25,6 +25,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> #include "cifspdu.h" +#include "cifsfs.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -1295,11 +1296,8 @@ cifs_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, - .rq_pages = rdata->pages, - .rq_offset = rdata->page_offset, - .rq_npages = rdata->nr_pages, - .rq_pagesz = rdata->pagesz, - .rq_tailsz = rdata->tailsz }; + .rq_iter_size = iov_iter_count(&rdata->iter), + .rq_iter = rdata->iter }; struct cifs_credits credits = { .value = 1, .instance = 0 }; cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n", @@ -1738,11 +1736,8 @@ cifs_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&wdata->iter); cifs_dbg(FYI, "async write at %llu %u bytes\n", wdata->offset, wdata->bytes); @@ -5373,14 +5368,15 @@ CIFSSMBSetPathInfoFB(const unsigned int xid, struct cifs_tcon *tcon, 
struct cifs_fid fid; int rc; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = fileName; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = fileName, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -5787,7 +5783,7 @@ QAllEAsRetry: /* account for ea list len */ list_len -= 4; - temp_fea = ea_response_data->list; + temp_fea = &ea_response_data->list; temp_ptr = (char *)temp_fea; while (list_len > 0) { unsigned int name_len; @@ -5902,7 +5898,7 @@ SetEARetry: else name_len = strnlen(ea_name, 255); - count = sizeof(*parm_data) + ea_value_len + name_len; + count = sizeof(*parm_data) + 1 + ea_value_len + name_len; pSMB->MaxParameterCount = cpu_to_le16(2); /* BB find max SMB PDU from sess */ pSMB->MaxDataCount = cpu_to_le16(1000); @@ -5926,14 +5922,14 @@ SetEARetry: byte_count = 3 /* pad */ + params + count; pSMB->DataCount = cpu_to_le16(count); parm_data->list_len = cpu_to_le32(count); - parm_data->list[0].EA_flags = 0; + parm_data->list.EA_flags = 0; /* we checked above that name len is less than 255 */ - parm_data->list[0].name_len = (__u8)name_len; + parm_data->list.name_len = (__u8)name_len; /* EA names are always ASCII */ if (ea_name) - strncpy(parm_data->list[0].name, ea_name, name_len); - parm_data->list[0].name[name_len] = 0; - parm_data->list[0].value_len = cpu_to_le16(ea_value_len); + strncpy(parm_data->list.name, ea_name, name_len); + parm_data->list.name[name_len] = '\0'; + parm_data->list.value_len = cpu_to_le16(ea_value_len); /* caller ensures that ea_value_len is less than 64K but we need to ensure that it fits within the smb */ @@ -5941,7 +5937,7 @@ SetEARetry: negotiated SMB buffer size BB */ /* if (ea_value_len > buffer_size - 512 (enough for header)) */ if (ea_value_len) - memcpy(parm_data->list[0].name+name_len+1, + memcpy(parm_data->list.name + name_len + 1, ea_value, ea_value_len); pSMB->TotalDataCount = pSMB->DataCount; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e6088d96eb04..ec020d860be3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -79,8 +79,6 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) int len; char *unc; struct sockaddr_storage ss; - time64_t expiry, now; - unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT; if (!server->hostname) return -EINVAL; @@ -102,29 +100,19 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) ss = server->dstaddr; spin_unlock(&server->srv_lock); - rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry); + rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL); kfree(unc); if (rc < 0) { cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", __func__, server->hostname, rc); - goto requeue_resolve; + } else { + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); + rc = 0; } - spin_lock(&server->srv_lock); - memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr)); - spin_unlock(&server->srv_lock); - - now = ktime_get_real_seconds(); - if (expiry && expiry > now) - /* To make sure we don't use the cached entry, retry 1s */ - ttl = max_t(unsigned long, expiry - now, 
SMB_DNS_RESOLVE_INTERVAL_MIN) + 1; - -requeue_resolve: - cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n", - __func__, ttl); - mod_delayed_work(cifsiod_wq, &server->resolve, (ttl * HZ)); - return rc; } @@ -148,26 +136,6 @@ static void smb2_query_server_interfaces(struct work_struct *work) (SMB_INTERFACE_POLL_INTERVAL * HZ)); } -static void cifs_resolve_server(struct work_struct *work) -{ - int rc; - struct TCP_Server_Info *server = container_of(work, - struct TCP_Server_Info, resolve.work); - - cifs_server_lock(server); - - /* - * Resolve the hostname again to make sure that IP address is up-to-date. - */ - rc = reconn_set_ipaddr_from_hostname(server); - if (rc) { - cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", - __func__, rc); - } - - cifs_server_unlock(server); -} - /* * Update the tcpStatus for the server. * This is used to signal the cifsd thread to call cifs_reconnect @@ -766,6 +734,20 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, return cifs_readv_from_socket(server, &smb_msg); } +int +cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter, + unsigned int to_read) +{ + struct msghdr smb_msg = { .msg_iter = *iter }; + int ret; + + iov_iter_truncate(&smb_msg.msg_iter, to_read); + ret = cifs_readv_from_socket(server, &smb_msg); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + static bool is_smb_response(struct TCP_Server_Info *server, unsigned char type) { @@ -926,7 +908,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) spin_unlock(&server->srv_lock); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; @@ -1550,7 +1531,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_put_tcp_session(server->primary_server, from_reconnect); cancel_delayed_work_sync(&server->echo); - cancel_delayed_work_sync(&server->resolve); if (from_reconnect) /* @@ -1656,7 +1636,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); - INIT_DELAYED_WORK(&tcp_ses->resolve, cifs_resolve_server); INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); mutex_init(&tcp_ses->reconnect_mutex); #ifdef CONFIG_CIFS_DFS_UPCALL @@ -1745,12 +1724,6 @@ smbd_connected: /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); - /* queue dns resolution delayed work */ - cifs_dbg(FYI, "%s: next dns resolution scheduled for %d seconds in the future\n", - __func__, SMB_DNS_RESOLVE_INTERVAL_DEFAULT); - - queue_delayed_work(cifsiod_wq, &tcp_ses->resolve, (SMB_DNS_RESOLVE_INTERVAL_DEFAULT * HZ)); - return tcp_ses; out_err_crypto_release: @@ -2844,72 +2817,48 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * negprot - BB check reconnection in case where second * sessinit is sent but no second negprot */ - struct rfc1002_session_packet *ses_init_buf; - unsigned int req_noscope_len; - struct smb_hdr *smb_buf; + struct rfc1002_session_packet req = {}; + struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + unsigned int len; - ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), - GFP_KERNEL); + req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); - if (ses_init_buf) { - ses_init_buf->trailer.session_req.called_len = 32; + if 
(server->server_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.called_name, + server->server_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.called_name, + DEFAULT_CIFS_CALLED_NAME, + RFC1001_NAME_LEN_WITH_NULL); - if (server->server_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - server->server_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.called_name, - DEFAULT_CIFS_CALLED_NAME, - RFC1001_NAME_LEN_WITH_NULL); + req.trailer.session_req.calling_len = sizeof(req.trailer.session_req.calling_name); - ses_init_buf->trailer.session_req.calling_len = 32; + /* calling name ends in null (byte 16) from old smb convention */ + if (server->workstation_RFC1001_name[0] != 0) + rfc1002mangle(req.trailer.session_req.calling_name, + server->workstation_RFC1001_name, + RFC1001_NAME_LEN_WITH_NULL); + else + rfc1002mangle(req.trailer.session_req.calling_name, + "LINUX_CIFS_CLNT", + RFC1001_NAME_LEN_WITH_NULL); - /* - * calling name ends in null (byte 16) from old smb - * convention. - */ - if (server->workstation_RFC1001_name[0] != 0) - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - server->workstation_RFC1001_name, - RFC1001_NAME_LEN_WITH_NULL); - else - rfc1002mangle(ses_init_buf->trailer. - session_req.calling_name, - "LINUX_CIFS_CLNT", - RFC1001_NAME_LEN_WITH_NULL); - - ses_init_buf->trailer.session_req.scope1 = 0; - ses_init_buf->trailer.session_req.scope2 = 0; - smb_buf = (struct smb_hdr *)ses_init_buf; - - /* sizeof RFC1002_SESSION_REQUEST with no scopes */ - req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; - - /* == cpu_to_be32(0x81000044) */ - smb_buf->smb_buf_length = - cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); - rc = smb_send(server, smb_buf, 0x44); - kfree(ses_init_buf); - /* - * RFC1001 layer in at least one server - * requires very short break before negprot - * presumably because not expecting negprot - * to follow so fast. This is a simple - * solution that works without - * complicating the code and causes no - * significant slowing down on mount - * for everyone else - */ - usleep_range(1000, 2000); - } /* - * else the negprot may still work without this - * even though malloc failed + * As per rfc1002, @len must be the number of bytes that follows the + * length field of a rfc1002 session request payload. */ + len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + + smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); + rc = smb_send(server, smb_buf, len); + /* + * RFC1001 layer in at least one server requires very short break before + * negprot presumably because not expecting negprot to follow so fast. + * This is a simple solution that works without complicating the code + * and causes no significant slowing down on mount for everyone else + */ + usleep_range(1000, 2000); return rc; } @@ -3760,16 +3709,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; spin_lock(&ses->ses_lock); - if (server->dstaddr.ss_family == AF_INET6) - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); - else - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); - if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { @@ -3793,6 +3738,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, ses->ses_status = SES_IN_SETUP; spin_unlock(&ses->ses_lock); + /* update ses ip_addr only for primary chan */ + if (server == pserver) { + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + } + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2b6076324ffc..30b1e1bfd204 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -304,15 +304,16 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned if (!tcon->unix_ext && (mode & S_IWUGO) == 0) create_options |= CREATE_OPTION_READONLY; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; - oparms.mode = mode; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + .mode = mode, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) { cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 2870e3b6ffe8..0e602173ac76 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -38,6 +38,125 @@ #include "cached_dir.h" /* + * Remove the dirty flags from a span of pages. + */ +static void cifs_undirty_folios(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each_marked(&xas, folio, end, PAGECACHE_TAG_DIRTY) { + xas_pause(&xas); + rcu_read_unlock(); + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_unlock(folio); + rcu_read_lock(); + } + + rcu_read_unlock(); +} + +/* + * Completion of write to server. + */ +void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_detach_private(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Failure of write to server. 
+ */ +void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + folio_set_error(folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* + * Redirty pages after a temporary failure. + */ +void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int len) +{ + struct address_space *mapping = inode->i_mapping; + struct folio *folio; + pgoff_t end; + + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); + + if (!len) + return; + + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + WARN_ONCE(1, "bad %x @%llx page %lx %lx\n", + len, start, folio_index(folio), end); + continue; + } + + filemap_dirty_folio(folio->mapping, folio); + folio_end_writeback(folio); + } + + rcu_read_unlock(); +} + +/* * Mark as invalid, all open files on tree connections since they * were closed when session to server was lost. */ @@ -261,14 +380,15 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_ if (f_flags & O_DIRECT) create_options |= CREATE_NO_BUFFER; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = fid, + }; rc = server->ops->open(xid, &oparms, oplock, buf); if (rc) @@ -849,14 +969,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &cfile->fid); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = desired_access; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.disposition = disposition; - oparms.path = full_path; - oparms.fid = &cfile->fid; - oparms.reconnect = true; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = desired_access, + .create_options = cifs_create_options(cifs_sb, create_options), + .disposition = disposition, + .path = full_path, + .fid = &cfile->fid, + .reconnect = true, + }; /* * Can not refresh inode by passing in file_info buf to be returned by @@ -2296,7 +2418,6 @@ cifs_writedata_release(struct kref *refcount) if (wdata->cfile) cifsFileInfo_put(wdata->cfile); - kvfree(wdata->pages); kfree(wdata); } @@ -2307,51 +2428,49 @@ cifs_writedata_release(struct kref *refcount) static void cifs_writev_requeue(struct cifs_writedata *wdata) { - int i, rc = 0; + int rc = 0; struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; - unsigned int rest_len; + unsigned int rest_len = wdata->bytes; + loff_t fpos = wdata->offset; server = tlink_tcon(wdata->cfile->tlink)->ses->server; - i = 0; - rest_len = wdata->bytes; do { struct cifs_writedata *wdata2; - unsigned int 
j, nr_pages, wsize, tailsz, cur_len; + unsigned int wsize, cur_len; wsize = server->ops->wp_retry_size(inode); if (wsize < rest_len) { - nr_pages = wsize / PAGE_SIZE; - if (!nr_pages) { + if (wsize < PAGE_SIZE) { rc = -EOPNOTSUPP; break; } - cur_len = nr_pages * PAGE_SIZE; - tailsz = PAGE_SIZE; + cur_len = min(round_down(wsize, PAGE_SIZE), rest_len); } else { - nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE); cur_len = rest_len; - tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE; } - wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete); + wdata2 = cifs_writedata_alloc(cifs_writev_complete); if (!wdata2) { rc = -ENOMEM; break; } - for (j = 0; j < nr_pages; j++) { - wdata2->pages[j] = wdata->pages[i + j]; - lock_page(wdata2->pages[j]); - clear_page_dirty_for_io(wdata2->pages[j]); - } - wdata2->sync_mode = wdata->sync_mode; - wdata2->nr_pages = nr_pages; - wdata2->offset = page_offset(wdata2->pages[0]); - wdata2->pagesz = PAGE_SIZE; - wdata2->tailsz = tailsz; - wdata2->bytes = cur_len; + wdata2->offset = fpos; + wdata2->bytes = cur_len; + wdata2->iter = wdata->iter; + + iov_iter_advance(&wdata2->iter, fpos - wdata->offset); + iov_iter_truncate(&wdata2->iter, wdata2->bytes); + + if (iov_iter_is_xarray(&wdata2->iter)) + /* Check for pages having been redirtied and clean + * them. We can do this by walking the xarray. If + * it's not an xarray, then it's a DIO and we shouldn't + * be mucking around with the page bits. + */ + cifs_undirty_folios(inode, fpos, cur_len); rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &wdata2->cfile); @@ -2366,33 +2485,22 @@ cifs_writev_requeue(struct cifs_writedata *wdata) cifs_writedata_release); } - for (j = 0; j < nr_pages; j++) { - unlock_page(wdata2->pages[j]); - if (rc != 0 && !is_retryable_error(rc)) { - SetPageError(wdata2->pages[j]); - end_page_writeback(wdata2->pages[j]); - put_page(wdata2->pages[j]); - } - } - kref_put(&wdata2->refcount, cifs_writedata_release); if (rc) { if (is_retryable_error(rc)) continue; - i += nr_pages; + fpos += cur_len; + rest_len -= cur_len; break; } + fpos += cur_len; rest_len -= cur_len; - i += nr_pages; - } while (i < wdata->nr_pages); + } while (rest_len > 0); - /* cleanup remaining pages from the original wdata */ - for (; i < wdata->nr_pages; i++) { - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); - } + /* Clean up remaining pages from the original wdata */ + if (iov_iter_is_xarray(&wdata->iter)) + cifs_pages_write_failed(inode, fpos, rest_len); if (rc != 0 && !is_retryable_error(rc)) mapping_set_error(inode->i_mapping, rc); @@ -2405,7 +2513,6 @@ cifs_writev_complete(struct work_struct *work) struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); struct inode *inode = d_inode(wdata->cfile->dentry); - int i = 0; if (wdata->result == 0) { spin_lock(&inode->i_lock); @@ -2416,45 +2523,24 @@ cifs_writev_complete(struct work_struct *work) } else if (wdata->sync_mode == WB_SYNC_ALL && wdata->result == -EAGAIN) return cifs_writev_requeue(wdata); - for (i = 0; i < wdata->nr_pages; i++) { - struct page *page = wdata->pages[i]; + if (wdata->result == -EAGAIN) + cifs_pages_write_redirty(inode, wdata->offset, wdata->bytes); + else if (wdata->result < 0) + cifs_pages_write_failed(inode, wdata->offset, wdata->bytes); + else + cifs_pages_written_back(inode, wdata->offset, wdata->bytes); - if (wdata->result == -EAGAIN) - __set_page_dirty_nobuffers(page); - else if (wdata->result < 0) - SetPageError(page); - end_page_writeback(page); - 
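
[Editorial note, not part of the patch.] The reworked cifs_writev_requeue() above drops the per-page bookkeeping: each retry is cut from the original iterator by advancing a copy to the chunk's file position and truncating it to the chunk length, and each chunk is capped at the server's retry size rounded down to whole pages. A minimal userspace sketch of that chunking arithmetic, assuming 4KiB pages and an invented 64KiB retry window (wp_retry_size() here is a stand-in, not the driver function):

/* Illustrative only: mimics the (offset, length) chunking used by the
 * reworked cifs_writev_requeue().  Not kernel code. */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

/* Stand-in for server->ops->wp_retry_size(inode); value is invented. */
static size_t wp_retry_size(void)
{
	return 65536;
}

static size_t min_size(size_t a, size_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long long fpos = 3 * PAGE_SIZE + 100;	/* wdata->offset */
	size_t rest_len = 200000;			/* wdata->bytes  */

	while (rest_len > 0) {
		size_t wsize = wp_retry_size();
		size_t cur_len;

		if (wsize < rest_len) {
			if (wsize < PAGE_SIZE)
				break;		/* -EOPNOTSUPP in the driver */
			/* Round the chunk down to a whole number of pages. */
			cur_len = min_size(wsize & ~(PAGE_SIZE - 1), rest_len);
		} else {
			cur_len = rest_len;
		}

		/* The driver would copy wdata->iter, advance it by
		 * (fpos - wdata->offset) and truncate it to cur_len here. */
		printf("resend %zu bytes at %llu\n", cur_len, fpos);

		fpos += cur_len;
		rest_len -= cur_len;
	}
	return 0;
}

With these inputs the loop emits three 64KiB chunks followed by the 3392-byte remainder, which is the same progression the requeue loop walks through rest_len.
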
cifs_readpage_to_fscache(inode, page); - put_page(page); - } if (wdata->result != -EAGAIN) mapping_set_error(inode->i_mapping, wdata->result); kref_put(&wdata->refcount, cifs_writedata_release); } -struct cifs_writedata * -cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct cifs_writedata *writedata = NULL; - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (pages) { - writedata = cifs_writedata_direct_alloc(pages, complete); - if (!writedata) - kvfree(pages); - } - - return writedata; -} - -struct cifs_writedata * -cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) +struct cifs_writedata *cifs_writedata_alloc(work_func_t complete) { struct cifs_writedata *wdata; wdata = kzalloc(sizeof(*wdata), GFP_NOFS); if (wdata != NULL) { - wdata->pages = pages; kref_init(&wdata->refcount); INIT_LIST_HEAD(&wdata->list); init_completion(&wdata->done); @@ -2463,7 +2549,6 @@ cifs_writedata_direct_alloc(struct page **pages, work_func_t complete) return wdata; } - static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) { struct address_space *mapping = page->mapping; @@ -2522,310 +2607,372 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) return rc; } -static struct cifs_writedata * -wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, - pgoff_t end, pgoff_t *index, - unsigned int *found_pages) +/* + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + */ +static void cifs_extend_writeback(struct address_space *mapping, + long *_count, + loff_t start, + int max_pages, + size_t max_len, + unsigned int *_len) { - struct cifs_writedata *wdata; - - wdata = cifs_writedata_alloc((unsigned int)tofind, - cifs_writev_complete); - if (!wdata) - return NULL; - - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); - return wdata; -} + struct folio_batch batch; + struct folio *folio; + unsigned int psize, nr_pages; + size_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; + XA_STATE(xas, &mapping->i_pages, index); -static unsigned int -wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages, - struct address_space *mapping, - struct writeback_control *wbc, - pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done) -{ - unsigned int nr_pages = 0, i; - struct page *page; + folio_batch_init(&batch); - for (i = 0; i < found_pages; i++) { - page = wdata->pages[i]; - /* - * At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping + do { + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. 
*/ + rcu_read_lock(); - if (nr_pages == 0) - lock_page(page); - else if (!trylock_page(page)) - break; - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - break; - } + xas_for_each(&xas, folio, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, folio)) + continue; + if (xa_is_value(folio)) + break; + if (folio_index(folio) != index) + break; + if (!folio_try_get_rcu(folio)) { + xas_reset(&xas); + continue; + } + nr_pages = folio_nr_pages(folio); + if (nr_pages > max_pages) + break; - if (!wbc->range_cyclic && page->index > end) { - *done = true; - unlock_page(page); - break; - } + /* Has the page moved or been split? */ + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + break; + } - if (*next && (page->index != *next)) { - /* Not next consecutive page */ - unlock_page(page); - break; - } + if (!folio_trylock(folio)) { + folio_put(folio); + break; + } + if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + folio_unlock(folio); + folio_put(folio); + break; + } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + max_pages -= nr_pages; + psize = folio_size(folio); + len += psize; + stop = false; + if (max_pages <= 0 || len >= max_len || *_count <= 0) + stop = true; - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - break; + index += nr_pages; + if (!folio_batch_add(&batch, folio)) + break; + if (stop) + break; } - /* - * This actually clears the dirty bit in the radix tree. - * See cifs_writepage() for more commentary. + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. */ - set_page_writeback(page); - if (page_offset(page) >= i_size_read(mapping->host)) { - *done = true; - unlock_page(page); - end_page_writeback(page); + if (!folio_batch_count(&batch)) break; - } - wdata->pages[i] = page; - *next = page->index + 1; - ++nr_pages; - } - - /* reset index to refind any pages skipped */ - if (nr_pages == 0) - *index = wdata->pages[0]->index + 1; - - /* put any pages we aren't going to use */ - for (i = nr_pages; i < found_pages; i++) { - put_page(wdata->pages[i]); - wdata->pages[i] = NULL; - } - - return nr_pages; -} - -static int -wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, - struct address_space *mapping, struct writeback_control *wbc) -{ - int rc; - - wdata->sync_mode = wbc->sync_mode; - wdata->nr_pages = nr_pages; - wdata->offset = page_offset(wdata->pages[0]); - wdata->pagesz = PAGE_SIZE; - wdata->tailsz = min(i_size_read(mapping->host) - - page_offset(wdata->pages[nr_pages - 1]), - (loff_t)PAGE_SIZE); - wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz; - wdata->pid = wdata->cfile->pid; - - rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); - if (rc) - return rc; - - if (wdata->cfile->invalidHandle) - rc = -EAGAIN; - else - rc = wdata->server->ops->async_writev(wdata, - cifs_writedata_release); - - return rc; -} + for (i = 0; i < folio_batch_count(&batch); i++) { + folio = batch.folios[i]; + /* The folio should be locked, dirty and not undergoing + * writeback from the loop above. 
+ */ + if (!folio_clear_dirty_for_io(folio)) + WARN_ON(1); + if (folio_start_writeback(folio)) + WARN_ON(1); -static int -cifs_writepage_locked(struct page *page, struct writeback_control *wbc); + *_count -= folio_nr_pages(folio); + folio_unlock(folio); + } -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret; + folio_batch_release(&batch); + cond_resched(); + } while (!stop); - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); - mapping_set_error(mapping, ret); - return ret; + *_len = len; } -static int cifs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +/* + * Write back the locked page and any subsequent non-locked dirty pages. + */ +static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct folio *folio, + loff_t start, loff_t end) { struct inode *inode = mapping->host; - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct TCP_Server_Info *server; - bool done = false, scanned = false, range_whole = false; - pgoff_t end, index; struct cifs_writedata *wdata; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_credits credits_on_stack; + struct cifs_credits *credits = &credits_on_stack; struct cifsFileInfo *cfile = NULL; - int rc = 0; - int saved_rc = 0; - unsigned int xid; + unsigned int xid, wsize, len; + loff_t i_size = i_size_read(inode); + size_t max_len; + long count = wbc->nr_to_write; + int rc; - /* - * If wsize is smaller than the page cache size, default to writing - * one page at a time. - */ - if (cifs_sb->ctx->wsize < PAGE_SIZE) - return write_cache_pages(mapping, wbc, cifs_write_one_page, - mapping); + /* The folio should be locked, dirty and not undergoing writeback. */ + if (folio_start_writeback(folio)) + WARN_ON(1); + + count -= folio_nr_pages(folio); + len = folio_size(folio); xid = get_xid(); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - scanned = true; - } server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses); -retry: - while (!done && index <= end) { - unsigned int i, nr_pages, found_pages, wsize; - pgoff_t next = 0, tofind, saved_index = index; - struct cifs_credits credits_on_stack; - struct cifs_credits *credits = &credits_on_stack; - int get_file_rc = 0; + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + if (rc) { + cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc); + goto err_xid; + } - if (cfile) - cifsFileInfo_put(cfile); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, + &wsize, credits); + if (rc != 0) + goto err_close; - rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + wdata = cifs_writedata_alloc(cifs_writev_complete); + if (!wdata) { + rc = -ENOMEM; + goto err_uncredit; + } - /* in case of an error store it to return later */ - if (rc) - get_file_rc = rc; + wdata->sync_mode = wbc->sync_mode; + wdata->offset = folio_pos(folio); + wdata->pid = cfile->pid; + wdata->credits = credits_on_stack; + wdata->cfile = cfile; + wdata->server = server; + cfile = NULL; + + /* Find all consecutive lockable dirty pages, stopping when we find a + * page that is not immediately lockable, is not dirty or is missing, + * or we reach the end of the range. 
+ */ + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = wsize; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize, - &wsize, credits); - if (rc != 0) { - done = true; - break; - } + if (len < max_len) { + int max_pages = INT_MAX; - tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1; +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_pages = server->smbd_conn->max_frmr_depth; +#endif + max_pages -= folio_nr_pages(folio); - wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index, - &found_pages); - if (!wdata) { - rc = -ENOMEM; - done = true; - add_credits_and_wake_if(server, credits, 0); - break; + if (max_pages > 0) + cifs_extend_writeback(mapping, &count, start, + max_pages, max_len, &len); } + len = min_t(loff_t, len, max_len); + } - if (found_pages == 0) { - kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); - break; - } + wdata->bytes = len; + + /* We now have a contiguous set of dirty pages, each with writeback + * set; the first page is still locked at this point, but all the rest + * have been unlocked. + */ + folio_unlock(folio); + + if (start < i_size) { + iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages, + start, len); - nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc, - end, &index, &next, &done); + rc = adjust_credits(wdata->server, &wdata->credits, wdata->bytes); + if (rc) + goto err_wdata; - /* nothing to write? */ - if (nr_pages == 0) { + if (wdata->cfile->invalidHandle) + rc = -EAGAIN; + else + rc = wdata->server->ops->async_writev(wdata, + cifs_writedata_release); + if (rc >= 0) { kref_put(&wdata->refcount, cifs_writedata_release); - add_credits_and_wake_if(server, credits, 0); - continue; + goto err_close; } + } else { + /* The dirty region was entirely beyond the EOF. 
*/ + cifs_pages_written_back(inode, start, len); + rc = 0; + } - wdata->credits = credits_on_stack; - wdata->cfile = cfile; - wdata->server = server; - cfile = NULL; +err_wdata: + kref_put(&wdata->refcount, cifs_writedata_release); +err_uncredit: + add_credits_and_wake_if(server, credits, 0); +err_close: + if (cfile) + cifsFileInfo_put(cfile); +err_xid: + free_xid(xid); + if (rc == 0) { + wbc->nr_to_write = count; + } else if (is_retryable_error(rc)) { + cifs_pages_write_redirty(inode, start, len); + } else { + cifs_pages_write_failed(inode, start, len); + mapping_set_error(mapping, rc); + } + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); + return rc; +} - if (!wdata->cfile) { - cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", - get_file_rc); - if (is_retryable_error(get_file_rc)) - rc = get_file_rc; - else - rc = -EBADF; - } else - rc = wdata_send_pages(wdata, nr_pages, mapping, wbc); +/* + * write a region of pages back to the server + */ +static int cifs_writepages_region(struct address_space *mapping, + struct writeback_control *wbc, + loff_t start, loff_t end, loff_t *_next) +{ + struct folio *folio; + struct page *head_page; + ssize_t ret; + int n, skips = 0; - for (i = 0; i < nr_pages; ++i) - unlock_page(wdata->pages[i]); + do { + pgoff_t index = start / PAGE_SIZE; - /* send failure -- clean up the mess */ - if (rc != 0) { - add_credits_and_wake_if(server, &wdata->credits, 0); - for (i = 0; i < nr_pages; ++i) { - if (is_retryable_error(rc)) - redirty_page_for_writepage(wbc, - wdata->pages[i]); - else - SetPageError(wdata->pages[i]); - end_page_writeback(wdata->pages[i]); - put_page(wdata->pages[i]); + n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, 1, &head_page); + if (!n) + break; + + folio = page_folio(head_page); + start = folio_pos(folio); /* May regress with THPs */ + + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_put(folio); + return ret; + } + } else { + if (!folio_trylock(folio)) { + folio_put(folio); + return 0; } - if (!is_retryable_error(rc)) - mapping_set_error(mapping, rc); } - kref_put(&wdata->refcount, cifs_writedata_release); - if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) { - index = saved_index; + if (folio_mapping(folio) != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + folio_put(folio); continue; } - /* Return immediately if we received a signal during writing */ - if (is_interrupt_error(rc)) { - done = true; - break; + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); +#ifdef CONFIG_CIFS_FSCACHE + folio_wait_fscache(folio); +#endif + } else { + start += folio_size(folio); + } + folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } + continue; } - if (rc != 0 && saved_rc == 0) - saved_rc = rc; + if (!folio_clear_dirty_for_io(folio)) + /* We hold the page lock - it should've been dirty. 
*/ + WARN_ON(1); - wbc->nr_to_write -= nr_pages; - if (wbc->nr_to_write <= 0) - done = true; + ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end); + folio_put(folio); + if (ret < 0) + return ret; - index = next; - } + start += ret; + cond_resched(); + } while (wbc->nr_to_write > 0); - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = true; - index = 0; - goto retry; - } + *_next = start; + return 0; +} - if (saved_rc != 0) - rc = saved_rc; +/* + * Write some of the pending data back to the server + */ +static int cifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + loff_t start, next; + int ret; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + /* We have to be careful as we can end up racing with setattr() + * truncating the pagecache since the caller doesn't take a lock here + * to prevent it. + */ - if (cfile) - cifsFileInfo_put(cfile); - free_xid(xid); - /* Indication to update ctime and mtime as close is deferred */ - set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); - return rc; + if (wbc->range_cyclic) { + start = mapping->writeback_index * PAGE_SIZE; + ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); + if (ret == 0) { + mapping->writeback_index = next / PAGE_SIZE; + if (start > 0 && wbc->nr_to_write > 0) { + ret = cifs_writepages_region(mapping, wbc, 0, + start, &next); + if (ret == 0) + mapping->writeback_index = + next / PAGE_SIZE; + } + } + } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { + ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); + if (wbc->nr_to_write > 0 && ret == 0) + mapping->writeback_index = next / PAGE_SIZE; + } else { + ret = cifs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); + } + + return ret; } static int @@ -2877,6 +3024,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct cifsFileInfo *cfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); + struct folio *folio = page_folio(page); __u32 pid; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -2887,14 +3035,14 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n", page, pos, copied); - if (PageChecked(page)) { + if (folio_test_checked(folio)) { if (copied == len) - SetPageUptodate(page); - ClearPageChecked(page); - } else if (!PageUptodate(page) && copied == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); + folio_clear_checked(folio); + } else if (!folio_test_uptodate(folio) && copied == PAGE_SIZE) + folio_mark_uptodate(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { char *page_data; unsigned offset = pos & (PAGE_SIZE - 1); unsigned int xid; @@ -3054,57 +3202,13 @@ int cifs_flush(struct file *file, fl_owner_t id) return rc; } -static int -cifs_write_allocate_pages(struct page **pages, unsigned long num_pages) -{ - int rc = 0; - unsigned long i; - - for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - /* - * save number of pages we have already allocated and - * return with ENOMEM error - */ - num_pages = i; - rc = -ENOMEM; - break; - } - } - - if (rc) { - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - } - return 
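
[Editorial note, not part of the patch.] The new cifs_writepages() above handles range-cyclic writeback in two passes: it first writes back from the saved writeback_index towards the end of the file, then wraps around to cover the region before the starting point if any of the nr_to_write budget remains, recording the resume point for the next cycle. A minimal userspace model of that two-pass scheme, with an invented dirty-page array standing in for the pagecache:

/* Illustrative only: models the range-cyclic two-pass scheme used by the
 * reworked cifs_writepages().  Not kernel code. */
#include <stdio.h>

#define NPAGES 16

static int dirty[NPAGES] = { [2] = 1, [3] = 1, [9] = 1, [14] = 1 };

/* Stand-in for cifs_writepages_region(): write back dirty pages in
 * [start, end] while budget lasts and report where to resume. */
static int writepages_region(long start, long end, long *budget, long *next)
{
	long i;

	for (i = start; i <= end && i < NPAGES && *budget > 0; i++) {
		if (dirty[i]) {
			printf("writing page %ld\n", i);
			dirty[i] = 0;
			(*budget)--;
		}
	}
	*next = i;
	return 0;
}

int main(void)
{
	long writeback_index = 8;	/* resume point from the last cycle */
	long budget = 8;		/* wbc->nr_to_write */
	long start = writeback_index, next;

	/* Pass 1: from the previous resume point to the end of the file. */
	writepages_region(start, NPAGES - 1, &budget, &next);
	writeback_index = next;

	/* Pass 2: wrap around to the start if there is budget left. */
	if (start > 0 && budget > 0) {
		writepages_region(0, start - 1, &budget, &next);
		writeback_index = next;
	}

	printf("resume next cycle at page %ld\n", writeback_index);
	return 0;
}
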
rc; -} - -static inline -size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len) -{ - size_t num_pages; - size_t clen; - - clen = min_t(const size_t, len, wsize); - num_pages = DIV_ROUND_UP(clen, PAGE_SIZE); - - if (cur_len) - *cur_len = clen; - - return num_pages; -} - static void cifs_uncached_writedata_release(struct kref *refcount) { - int i; struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < wdata->nr_pages; i++) - put_page(wdata->pages[i]); cifs_writedata_release(refcount); } @@ -3131,48 +3235,6 @@ cifs_uncached_writev_complete(struct work_struct *work) } static int -wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, - size_t *len, unsigned long *num_pages) -{ - size_t save_len, copied, bytes, cur_len = *len; - unsigned long i, nr_pages = *num_pages; - - save_len = cur_len; - for (i = 0; i < nr_pages; i++) { - bytes = min_t(const size_t, cur_len, PAGE_SIZE); - copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from); - cur_len -= copied; - /* - * If we didn't copy as much as we expected, then that - * may mean we trod into an unmapped area. Stop copying - * at that point. On the next pass through the big - * loop, we'll likely end up getting a zero-length - * write and bailing out of it. - */ - if (copied < bytes) - break; - } - cur_len = save_len - cur_len; - *len = cur_len; - - /* - * If we have no data to send, then that probably means that - * the copy above failed altogether. That's most likely because - * the address in the iovec was bogus. Return -EFAULT and let - * the caller free anything we allocated and bail out. - */ - if (!cur_len) - return -EFAULT; - - /* - * i + 1 now represents the number of pages we actually used in - * the copy phase above. - */ - *num_pages = i + 1; - return 0; -} - -static int cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { @@ -3242,23 +3304,57 @@ fail: return rc; } +/* + * Select span of a bvec iterator we're going to use. Limit it by both maximum + * size and maximum number of segments. 
+ */ +static size_t cifs_limit_bvec_subset(const struct iov_iter *iter, size_t max_size, + size_t max_segs, unsigned int *_nsegs) +{ + const struct bio_vec *bvecs = iter->bvec; + unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0; + size_t len, span = 0, n = iter->count; + size_t skip = iter->iov_offset; + + if (WARN_ON(!iov_iter_is_bvec(iter)) || n == 0) + return 0; + + while (n && ix < nbv && skip) { + len = bvecs[ix].bv_len; + if (skip < len) + break; + skip -= len; + n -= len; + ix++; + } + + while (n && ix < nbv) { + len = min3(n, bvecs[ix].bv_len - skip, max_size); + span += len; + nsegs++; + ix++; + if (span >= max_size || nsegs >= max_segs) + break; + skip = 0; + n -= len; + } + + *_nsegs = nsegs; + return span; +} + static int -cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, +cifs_write_from_iter(loff_t fpos, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, struct cifs_aio_ctx *ctx) { int rc = 0; - size_t cur_len; - unsigned long nr_pages, num_pages, i; + size_t cur_len, max_len; struct cifs_writedata *wdata; - struct iov_iter saved_from = *from; - loff_t saved_offset = offset; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - unsigned int xid; + unsigned int xid, max_segs = INT_MAX; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3268,10 +3364,20 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); xid = get_xid(); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + do { - unsigned int wsize; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; + unsigned int wsize, nsegs = 0; + + if (signal_pending(current)) { + rc = -EINTR; + break; + } if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -3286,99 +3392,42 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) break; - cur_len = min_t(const size_t, len, wsize); - - if (ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - from, &pagevec, cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(from), - from->iov_offset, from->count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - nr_pages = - (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE; - - wdata = cifs_writedata_direct_alloc(pagevec, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - for (i = 0; i < nr_pages; i++) - put_page(pagevec[i]); - kvfree(pagevec); - add_credits_and_wake_if(server, credits, 0); - break; - } - - - wdata->page_offset = start; - wdata->tailsz = - nr_pages > 1 ? 
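
[Editorial note, not part of the patch.] The cifs_limit_bvec_subset() helper above walks the bvec array twice: once to step over data already consumed ahead of the span, and once to accumulate whole or partial segments until either the byte budget or the segment budget is exhausted. A standalone sketch of that accounting, where struct seg and the input numbers are invented for the example and "count" covers the whole segment array while "skip" is how much of its front has already been consumed:

/* Illustrative only: userspace analogue of the span-limiting helper.
 * Not kernel code. */
#include <stdio.h>
#include <stddef.h>

struct seg { size_t len; };		/* stand-in for struct bio_vec */

static size_t min3_size(size_t a, size_t b, size_t c)
{
	size_t m = a < b ? a : b;
	return m < c ? m : c;
}

static size_t limit_subset(const struct seg *segs, unsigned int nseg,
			   size_t count, size_t skip,
			   size_t max_size, size_t max_segs,
			   unsigned int *out_nsegs)
{
	unsigned int ix = 0, nsegs = 0;
	size_t len, span = 0, n = count;

	/* Step over segments that "skip" has consumed entirely. */
	while (n && ix < nseg && skip) {
		len = segs[ix].len;
		if (skip < len)
			break;
		skip -= len;
		n -= len;
		ix++;
	}

	/* Accumulate segments until either limit is reached. */
	while (n && ix < nseg) {
		len = min3_size(n, segs[ix].len - skip, max_size);
		span += len;
		nsegs++;
		ix++;
		if (span >= max_size || nsegs >= max_segs)
			break;
		skip = 0;
		n -= len;
	}

	*out_nsegs = nsegs;
	return span;
}

int main(void)
{
	struct seg segs[] = { { 4096 }, { 4096 }, { 4096 }, { 4096 } };
	unsigned int nsegs;
	size_t span;

	/* 16KiB in four segments, 4100 bytes already consumed,
	 * at most 8KiB and two segments per request. */
	span = limit_subset(segs, 4, 16384, 4100, 8192, 2, &nsegs);
	printf("span=%zu nsegs=%u\n", span, nsegs);	/* span=8188 nsegs=2 */
	return 0;
}

In the patch, max_size comes from the negotiated wsize/rsize and max_segs from the SMB Direct FRMR depth when smbdirect is in use, so one request never spans more registrable segments than the transport can describe.
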
- cur_len - (PAGE_SIZE - start) - - (nr_pages - 2) * PAGE_SIZE : - cur_len; - } else { - nr_pages = get_numpages(wsize, len, &cur_len); - wdata = cifs_writedata_alloc(nr_pages, - cifs_uncached_writev_complete); - if (!wdata) { - rc = -ENOMEM; - add_credits_and_wake_if(server, credits, 0); - break; - } - - rc = cifs_write_allocate_pages(wdata->pages, nr_pages); - if (rc) { - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } - - num_pages = nr_pages; - rc = wdata_fill_from_iovec( - wdata, from, &cur_len, &num_pages); - if (rc) { - for (i = 0; i < nr_pages; i++) - put_page(wdata->pages[i]); - kvfree(wdata->pages); - kfree(wdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + max_len = min_t(const size_t, len, wsize); + if (!max_len) { + rc = -EAGAIN; + add_credits_and_wake_if(server, credits, 0); + break; + } - /* - * Bring nr_pages down to the number of pages we - * actually used, and free any pages that we didn't use. - */ - for ( ; nr_pages > num_pages; nr_pages--) - put_page(wdata->pages[nr_pages - 1]); + cur_len = cifs_limit_bvec_subset(from, max_len, max_segs, &nsegs); + cifs_dbg(FYI, "write_from_iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, from->nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); + wdata = cifs_writedata_alloc(cifs_uncached_writev_complete); + if (!wdata) { + rc = -ENOMEM; + add_credits_and_wake_if(server, credits, 0); + break; } wdata->sync_mode = WB_SYNC_ALL; - wdata->nr_pages = nr_pages; - wdata->offset = (__u64)offset; - wdata->cfile = cifsFileInfo_get(open_file); - wdata->server = server; - wdata->pid = pid; - wdata->bytes = cur_len; - wdata->pagesz = PAGE_SIZE; - wdata->credits = credits_on_stack; - wdata->ctx = ctx; + wdata->offset = (__u64)fpos; + wdata->cfile = cifsFileInfo_get(open_file); + wdata->server = server; + wdata->pid = pid; + wdata->bytes = cur_len; + wdata->credits = credits_on_stack; + wdata->iter = *from; + wdata->ctx = ctx; kref_get(&ctx->refcount); + iov_iter_truncate(&wdata->iter, cur_len); + rc = adjust_credits(server, &wdata->credits, wdata->bytes); if (!rc) { @@ -3393,16 +3442,14 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, add_credits_and_wake_if(server, &wdata->credits, 0); kref_put(&wdata->refcount, cifs_uncached_writedata_release); - if (rc == -EAGAIN) { - *from = saved_from; - iov_iter_advance(from, offset - saved_offset); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&wdata->list, wdata_list); - offset += cur_len; + iov_iter_advance(from, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -3501,20 +3548,8 @@ static ssize_t __cifs_writev( struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct cifs_aio_ctx *ctx; - struct iov_iter saved_from = *from; - size_t len = iov_iter_count(from); int rc; - /* - * iov_iter_get_pages_alloc doesn't work with ITER_KVEC. - * In this case, fall back to non-direct write function. 
- * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(from)) { - cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); - direct = false; - } - rc = generic_write_checks(iocb, from); if (rc <= 0) return rc; @@ -3536,23 +3571,54 @@ static ssize_t __cifs_writev( ctx->iocb = iocb; ctx->pos = iocb->ki_pos; + ctx->direct_io = direct; + ctx->nr_pinned_pages = 0; - if (direct) { - ctx->direct_io = true; - ctx->iter = *from; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE); - if (rc) { + if (user_backed_iter(from)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(from, iov_iter_count(from), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + } else if ((iov_iter_is_bvec(from) || iov_iter_is_kvec(from)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be pinned by the caller; in any case, we may or may not + * be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, from, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. 
+ */ + ctx->iter = *from; } + ctx->len = iov_iter_count(&ctx->iter); + /* grab a lock here due to read response handlers can access ctx */ mutex_lock(&ctx->aio_mutex); - rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from, + rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &ctx->iter, cfile, cifs_sb, &ctx->list, ctx); /* @@ -3695,14 +3761,12 @@ out: return written; } -static struct cifs_readdata * -cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) +static struct cifs_readdata *cifs_readdata_alloc(work_func_t complete) { struct cifs_readdata *rdata; rdata = kzalloc(sizeof(*rdata), GFP_KERNEL); - if (rdata != NULL) { - rdata->pages = pages; + if (rdata) { kref_init(&rdata->refcount); INIT_LIST_HEAD(&rdata->list); init_completion(&rdata->done); @@ -3712,27 +3776,14 @@ cifs_readdata_direct_alloc(struct page **pages, work_func_t complete) return rdata; } -static struct cifs_readdata * -cifs_readdata_alloc(unsigned int nr_pages, work_func_t complete) -{ - struct page **pages = - kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - struct cifs_readdata *ret = NULL; - - if (pages) { - ret = cifs_readdata_direct_alloc(pages, complete); - if (!ret) - kfree(pages); - } - - return ret; -} - void cifs_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); + + if (rdata->ctx) + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); #ifdef CONFIG_CIFS_SMB_DIRECT if (rdata->mr) { smbd_deregister_mr(rdata->mr); @@ -3742,85 +3793,9 @@ cifs_readdata_release(struct kref *refcount) if (rdata->cfile) cifsFileInfo_put(rdata->cfile); - kvfree(rdata->pages); kfree(rdata); } -static int -cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) -{ - int rc = 0; - struct page *page; - unsigned int i; - - for (i = 0; i < nr_pages; i++) { - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) { - rc = -ENOMEM; - break; - } - rdata->pages[i] = page; - } - - if (rc) { - unsigned int nr_page_failed = i; - - for (i = 0; i < nr_page_failed; i++) { - put_page(rdata->pages[i]); - rdata->pages[i] = NULL; - } - } - return rc; -} - -static void -cifs_uncached_readdata_release(struct kref *refcount) -{ - struct cifs_readdata *rdata = container_of(refcount, - struct cifs_readdata, refcount); - unsigned int i; - - kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); - for (i = 0; i < rdata->nr_pages; i++) { - put_page(rdata->pages[i]); - } - cifs_readdata_release(refcount); -} - -/** - * cifs_readdata_to_iov - copy data from pages in response to an iovec - * @rdata: the readdata response with list of pages holding data - * @iter: destination for our data - * - * This function copies data from a list of pages in a readdata response into - * an array of iovecs. It will first calculate where the data should go - * based on the info in the readdata and then copy the data into that spot. - */ -static int -cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) -{ - size_t remaining = rdata->got_bytes; - unsigned int i; - - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t copy = min_t(size_t, remaining, PAGE_SIZE); - size_t written; - - if (unlikely(iov_iter_is_pipe(iter))) { - void *addr = kmap_atomic(page); - - written = copy_to_iter(addr, copy, iter); - kunmap_atomic(addr); - } else - written = copy_page_to_iter(page, 0, copy, iter); - remaining -= written; - if (written < copy && iov_iter_count(iter) > 0) - break; - } - return remaining ? 
-EFAULT : 0; -} - static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); static void @@ -3832,81 +3807,7 @@ cifs_uncached_readv_complete(struct work_struct *work) complete(&rdata->done); collect_uncached_read_data(rdata->ctx); /* the below call can possibly free the last ref to aio ctx */ - kref_put(&rdata->refcount, cifs_uncached_readdata_release); -} - -static int -uncached_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) -{ - int result = 0; - unsigned int i; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - size_t n; - unsigned int segment_size = rdata->pagesz; - - if (i == 0) - segment_size -= page_offset; - else - page_offset = 0; - - - if (len <= 0) { - /* no need to hold page hostage */ - rdata->pages[i] = NULL; - rdata->nr_pages--; - put_page(page); - continue; - } - - n = len; - if (len >= segment_size) - /* enough data to fill the page */ - n = segment_size; - else - rdata->tailsz = len; - len -= n; - - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; - - rdata->got_bytes += result; - } - - return result != -ECONNABORTED && rdata->got_bytes > 0 ? - rdata->got_bytes : result; -} - -static int -cifs_uncached_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return uncached_fill_pages(server, rdata, NULL, len); -} - -static int -cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return uncached_fill_pages(server, rdata, iter, iter->count); + kref_put(&rdata->refcount, cifs_readdata_release); } static int cifs_resend_rdata(struct cifs_readdata *rdata, @@ -3977,37 +3878,36 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata, } while (rc == -EAGAIN); fail: - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); return rc; } static int -cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, +cifs_send_async_read(loff_t fpos, size_t len, struct cifsFileInfo *open_file, struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; - unsigned int npages, rsize; + unsigned int rsize, nsegs, max_segs = INT_MAX; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - size_t cur_len; + size_t cur_len, max_len; int rc; pid_t pid; struct TCP_Server_Info *server; - struct page **pagevec; - size_t start; - struct iov_iter direct_iov = ctx->iter; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->smbd_conn) + max_segs = server->smbd_conn->max_frmr_depth; +#endif + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; else pid = current->tgid; - if (ctx->direct_io) - iov_iter_advance(&direct_iov, offset - ctx->pos); - do { if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, true); @@ -4027,78 +3927,37 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) break; - cur_len = min_t(const size_t, len, rsize); - - if 
(ctx->direct_io) { - ssize_t result; - - result = iov_iter_get_pages_alloc2( - &direct_iov, &pagevec, - cur_len, &start); - if (result < 0) { - cifs_dbg(VFS, - "Couldn't get user pages (rc=%zd) iter type %d iov_offset %zd count %zd\n", - result, iov_iter_type(&direct_iov), - direct_iov.iov_offset, - direct_iov.count); - dump_stack(); - - rc = result; - add_credits_and_wake_if(server, credits, 0); - break; - } - cur_len = (size_t)result; - - rdata = cifs_readdata_direct_alloc( - pagevec, cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - npages = (cur_len + start + PAGE_SIZE-1) / PAGE_SIZE; - rdata->page_offset = start; - rdata->tailsz = npages > 1 ? - cur_len-(PAGE_SIZE-start)-(npages-2)*PAGE_SIZE : - cur_len; - - } else { + max_len = min_t(size_t, len, rsize); - npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); - /* allocate a readdata struct */ - rdata = cifs_readdata_alloc(npages, - cifs_uncached_readv_complete); - if (!rdata) { - add_credits_and_wake_if(server, credits, 0); - rc = -ENOMEM; - break; - } - - rc = cifs_read_allocate_pages(rdata, npages); - if (rc) { - kvfree(rdata->pages); - kfree(rdata); - add_credits_and_wake_if(server, credits, 0); - break; - } + cur_len = cifs_limit_bvec_subset(&ctx->iter, max_len, + max_segs, &nsegs); + cifs_dbg(FYI, "read-to-iter len=%zx/%zx nsegs=%u/%lu/%u\n", + cur_len, max_len, nsegs, ctx->iter.nr_segs, max_segs); + if (cur_len == 0) { + rc = -EIO; + add_credits_and_wake_if(server, credits, 0); + break; + } - rdata->tailsz = PAGE_SIZE; + rdata = cifs_readdata_alloc(cifs_uncached_readv_complete); + if (!rdata) { + add_credits_and_wake_if(server, credits, 0); + rc = -ENOMEM; + break; } - rdata->server = server; - rdata->cfile = cifsFileInfo_get(open_file); - rdata->nr_pages = npages; - rdata->offset = offset; - rdata->bytes = cur_len; - rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->read_into_pages = cifs_uncached_read_into_pages; - rdata->copy_into_pages = cifs_uncached_copy_into_pages; - rdata->credits = credits_on_stack; - rdata->ctx = ctx; + rdata->server = server; + rdata->cfile = cifsFileInfo_get(open_file); + rdata->offset = fpos; + rdata->bytes = cur_len; + rdata->pid = pid; + rdata->credits = credits_on_stack; + rdata->ctx = ctx; kref_get(&ctx->refcount); + rdata->iter = ctx->iter; + iov_iter_truncate(&rdata->iter, cur_len); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { @@ -4110,17 +3969,15 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - if (rc == -EAGAIN) { - iov_iter_revert(&direct_iov, cur_len); + kref_put(&rdata->refcount, cifs_readdata_release); + if (rc == -EAGAIN) continue; - } break; } list_add_tail(&rdata->list, rdata_list); - offset += cur_len; + iov_iter_advance(&ctx->iter, cur_len); + fpos += cur_len; len -= cur_len; } while (len > 0); @@ -4162,22 +4019,6 @@ again: list_del_init(&rdata->list); INIT_LIST_HEAD(&tmp_list); - /* - * Got a part of data and then reconnect has - * happened -- fill the buffer and continue - * reading. 
- */ - if (got_bytes && got_bytes < rdata->bytes) { - rc = 0; - if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); - if (rc) { - kref_put(&rdata->refcount, - cifs_uncached_readdata_release); - continue; - } - } - if (ctx->direct_io) { /* * Re-use rdata as this is a @@ -4194,7 +4035,7 @@ again: &tmp_list, ctx); kref_put(&rdata->refcount, - cifs_uncached_readdata_release); + cifs_readdata_release); } list_splice(&tmp_list, &ctx->list); @@ -4202,8 +4043,6 @@ again: goto again; } else if (rdata->result) rc = rdata->result; - else if (!ctx->direct_io) - rc = cifs_readdata_to_iov(rdata, to); /* if there was a short read -- discard anything left */ if (rdata->got_bytes && rdata->got_bytes < rdata->bytes) @@ -4212,7 +4051,7 @@ again: ctx->total_len += rdata->got_bytes; } list_del_init(&rdata->list); - kref_put(&rdata->refcount, cifs_uncached_readdata_release); + kref_put(&rdata->refcount, cifs_readdata_release); } if (!ctx->direct_io) @@ -4244,16 +4083,6 @@ static ssize_t __cifs_readv( loff_t offset = iocb->ki_pos; struct cifs_aio_ctx *ctx; - /* - * iov_iter_get_pages_alloc() doesn't work with ITER_KVEC, - * fall back to data copy read path - * this could be improved by getting pages directly in ITER_KVEC - */ - if (direct && iov_iter_is_kvec(to)) { - cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); - direct = false; - } - len = iov_iter_count(to); if (!len) return 0; @@ -4272,26 +4101,53 @@ static ssize_t __cifs_readv( if (!ctx) return -ENOMEM; - ctx->cfile = cifsFileInfo_get(cfile); + ctx->pos = offset; + ctx->direct_io = direct; + ctx->len = len; + ctx->cfile = cifsFileInfo_get(cfile); + ctx->nr_pinned_pages = 0; if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (user_backed_iter(to)) - ctx->should_dirty = true; - - if (direct) { - ctx->pos = offset; - ctx->direct_io = true; - ctx->iter = *to; - ctx->len = len; - } else { - rc = setup_aio_ctx_iter(ctx, to, ITER_DEST); - if (rc) { + if (user_backed_iter(to)) { + /* + * Extract IOVEC/UBUF-type iterators to a BVEC-type iterator as + * they contain references to the calling process's virtual + * memory layout which won't be available in an async worker + * thread. This also takes a pin on every folio involved. + */ + rc = netfs_extract_user_iter(to, iov_iter_count(to), + &ctx->iter, 0); + if (rc < 0) { kref_put(&ctx->refcount, cifs_aio_ctx_release); return rc; } - len = ctx->len; + + ctx->nr_pinned_pages = rc; + ctx->bv = (void *)ctx->iter.bvec; + ctx->bv_need_unpin = iov_iter_extract_will_pin(&ctx->iter); + ctx->should_dirty = true; + } else if ((iov_iter_is_bvec(to) || iov_iter_is_kvec(to)) && + !is_sync_kiocb(iocb)) { + /* + * If the op is asynchronous, we need to copy the list attached + * to a BVEC/KVEC-type iterator, but we assume that the storage + * will be retained by the caller; in any case, we may or may + * not be able to pin the pages, so we don't try. + */ + ctx->bv = (void *)dup_iter(&ctx->iter, to, GFP_KERNEL); + if (!ctx->bv) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -ENOMEM; + } + } else { + /* + * Otherwise, we just pass the iterator down as-is and rely on + * the caller to make sure the pages referred to by the + * iterator don't evaporate. + */ + ctx->iter = *to; } if (direct) { @@ -4490,23 +4346,22 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * If the page is mmap'ed into a process' page tables, then we need to make * sure that it doesn't change while being written back. 
*/ -static vm_fault_t -cifs_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); - /* Wait for the page to be written to the cache before we allow it to - * be modified. We then assume the entire page will need writing back. + /* Wait for the folio to be written to the cache before we allow it to + * be modified. We then assume the entire folio will need writing back. */ #ifdef CONFIG_CIFS_FSCACHE - if (PageFsCache(page) && - wait_on_page_fscache_killable(page) < 0) + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) return VM_FAULT_RETRY; #endif - wait_on_page_writeback(page); + folio_wait_writeback(folio); - if (lock_page_killable(page) < 0) + if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; return VM_FAULT_LOCKED; } @@ -4554,149 +4409,72 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) return rc; } -static void -cifs_readv_complete(struct work_struct *work) +/* + * Unlock a bunch of folios in the pagecache. + */ +static void cifs_unlock_folios(struct address_space *mapping, pgoff_t first, pgoff_t last) { - unsigned int i, got_bytes; - struct cifs_readdata *rdata = container_of(work, - struct cifs_readdata, work); + struct folio *folio; + XA_STATE(xas, &mapping->i_pages, first); - got_bytes = rdata->got_bytes; - for (i = 0; i < rdata->nr_pages; i++) { - struct page *page = rdata->pages[i]; - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) { - flush_dcache_page(page); - SetPageUptodate(page); - } else - SetPageError(page); - - if (rdata->result == 0 || - (rdata->result == -EAGAIN && got_bytes)) - cifs_readpage_to_fscache(rdata->mapping->host, page); - - unlock_page(page); - - got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); - - put_page(page); - rdata->pages[i] = NULL; + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + folio_unlock(folio); } - kref_put(&rdata->refcount, cifs_readdata_release); + rcu_read_unlock(); } -static int -readpages_fill_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, struct iov_iter *iter, - unsigned int len) +static void cifs_readahead_complete(struct work_struct *work) { - int result = 0; - unsigned int i; - u64 eof; - pgoff_t eof_index; - unsigned int nr_pages = rdata->nr_pages; - unsigned int page_offset = rdata->page_offset; - - /* determine the eof that the server (probably) has */ - eof = CIFS_I(rdata->mapping->host)->server_eof; - eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0; - cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); - - rdata->got_bytes = 0; - rdata->tailsz = PAGE_SIZE; - for (i = 0; i < nr_pages; i++) { - struct page *page = rdata->pages[i]; - unsigned int to_read = rdata->pagesz; - size_t n; - - if (i == 0) - to_read -= page_offset; - else - page_offset = 0; - - n = to_read; - - if (len >= to_read) { - len -= to_read; - } else if (len > 0) { - /* enough for partial page, fill and zero the rest */ - zero_user(page, len + page_offset, to_read - len); - n = rdata->tailsz = len; - len = 0; - } else if (page->index > eof_index) { - /* - * The VFS will not try to do readahead past the - * i_size, but it's possible that we have outstanding - * writes with gaps in the middle and the i_size hasn't - * caught up yet. Populate those with zeroed out pages - * to prevent the VFS from repeatedly attempting to - * fill them until the writes are flushed. 
- */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } else { - /* no need to hold page hostage */ - unlock_page(page); - put_page(page); - rdata->pages[i] = NULL; - rdata->nr_pages--; - continue; - } + struct cifs_readdata *rdata = container_of(work, + struct cifs_readdata, work); + struct folio *folio; + pgoff_t last; + bool good = rdata->result == 0 || (rdata->result == -EAGAIN && rdata->got_bytes); - if (iter) - result = copy_page_from_iter( - page, page_offset, n, iter); -#ifdef CONFIG_CIFS_SMB_DIRECT - else if (rdata->mr) - result = n; -#endif - else - result = cifs_read_page_from_socket( - server, page, page_offset, n); - if (result < 0) - break; + XA_STATE(xas, &rdata->mapping->i_pages, rdata->offset / PAGE_SIZE); - rdata->got_bytes += result; - } + if (good) + cifs_readahead_to_fscache(rdata->mapping->host, + rdata->offset, rdata->bytes); - return result != -ECONNABORTED && rdata->got_bytes > 0 ? - rdata->got_bytes : result; -} + if (iov_iter_count(&rdata->iter) > 0) + iov_iter_zero(iov_iter_count(&rdata->iter), &rdata->iter); -static int -cifs_readpages_read_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, unsigned int len) -{ - return readpages_fill_pages(server, rdata, NULL, len); -} + last = (rdata->offset + rdata->bytes - 1) / PAGE_SIZE; -static int -cifs_readpages_copy_into_pages(struct TCP_Server_Info *server, - struct cifs_readdata *rdata, - struct iov_iter *iter) -{ - return readpages_fill_pages(server, rdata, iter, iter->count); + rcu_read_lock(); + xas_for_each(&xas, folio, last) { + if (good) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + } + rcu_read_unlock(); + + kref_put(&rdata->refcount, cifs_readdata_release); } static void cifs_readahead(struct readahead_control *ractl) { - int rc; struct cifsFileInfo *open_file = ractl->file->private_data; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file); struct TCP_Server_Info *server; - pid_t pid; - unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0; - pgoff_t next_cached = ULONG_MAX; + unsigned int xid, nr_pages, cache_nr_pages = 0; + unsigned int ra_pages; + pgoff_t next_cached = ULONG_MAX, ra_index; bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) && cifs_inode_cookie(ractl->mapping->host)->cache_priv; bool check_cache = caching; + pid_t pid; + int rc = 0; + + /* Note that readahead_count() lags behind our dequeuing of pages from + * the ractl, wo we have to keep track for ourselves. + */ + ra_pages = readahead_count(ractl); + ra_index = readahead_index(ractl); xid = get_xid(); @@ -4705,22 +4483,21 @@ static void cifs_readahead(struct readahead_control *ractl) else pid = current->tgid; - rc = 0; server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", - __func__, ractl->file, ractl->mapping, readahead_count(ractl)); + __func__, ractl->file, ractl->mapping, ra_pages); /* * Chop the readahead request up into rsize-sized read requests. 
*/ - while ((nr_pages = readahead_count(ractl) - last_batch_size)) { - unsigned int i, got, rsize; - struct page *page; + while ((nr_pages = ra_pages)) { + unsigned int i, rsize; struct cifs_readdata *rdata; struct cifs_credits credits_on_stack; struct cifs_credits *credits = &credits_on_stack; - pgoff_t index = readahead_index(ractl) + last_batch_size; + struct folio *folio; + pgoff_t fsize; /* * Find out if we have anything cached in the range of @@ -4729,21 +4506,22 @@ static void cifs_readahead(struct readahead_control *ractl) if (caching) { if (check_cache) { rc = cifs_fscache_query_occupancy( - ractl->mapping->host, index, nr_pages, + ractl->mapping->host, ra_index, nr_pages, &next_cached, &cache_nr_pages); if (rc < 0) caching = false; check_cache = false; } - if (index == next_cached) { + if (ra_index == next_cached) { /* * TODO: Send a whole batch of pages to be read * by the cache. */ - struct folio *folio = readahead_folio(ractl); - - last_batch_size = folio_nr_pages(folio); + folio = readahead_folio(ractl); + fsize = folio_nr_pages(folio); + ra_pages -= fsize; + ra_index += fsize; if (cifs_readpage_from_fscache(ractl->mapping->host, &folio->page) < 0) { /* @@ -4754,8 +4532,8 @@ static void cifs_readahead(struct readahead_control *ractl) caching = false; } folio_unlock(folio); - next_cached++; - cache_nr_pages--; + next_cached += fsize; + cache_nr_pages -= fsize; if (cache_nr_pages == 0) check_cache = true; continue; @@ -4780,8 +4558,9 @@ static void cifs_readahead(struct readahead_control *ractl) &rsize, credits); if (rc) break; - nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl)); - nr_pages = min_t(size_t, nr_pages, next_cached - index); + nr_pages = min_t(size_t, rsize / PAGE_SIZE, ra_pages); + if (next_cached != ULONG_MAX) + nr_pages = min_t(size_t, nr_pages, next_cached - ra_index); /* * Give up immediately if rsize is too small to read an entire @@ -4794,33 +4573,31 @@ static void cifs_readahead(struct readahead_control *ractl) break; } - rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); + rdata = cifs_readdata_alloc(cifs_readahead_complete); if (!rdata) { /* best to give up if we're out of mem */ add_credits_and_wake_if(server, credits, 0); break; } - got = __readahead_batch(ractl, rdata->pages, nr_pages); - if (got != nr_pages) { - pr_warn("__readahead_batch() returned %u/%u\n", - got, nr_pages); - nr_pages = got; - } - - rdata->nr_pages = nr_pages; - rdata->bytes = readahead_batch_length(ractl); + rdata->offset = ra_index * PAGE_SIZE; + rdata->bytes = nr_pages * PAGE_SIZE; rdata->cfile = cifsFileInfo_get(open_file); rdata->server = server; rdata->mapping = ractl->mapping; - rdata->offset = readahead_pos(ractl); rdata->pid = pid; - rdata->pagesz = PAGE_SIZE; - rdata->tailsz = PAGE_SIZE; - rdata->read_into_pages = cifs_readpages_read_into_pages; - rdata->copy_into_pages = cifs_readpages_copy_into_pages; rdata->credits = credits_on_stack; + for (i = 0; i < nr_pages; i++) { + if (!readahead_folio(ractl)) + WARN_ON(1); + } + ra_pages -= nr_pages; + ra_index += nr_pages; + + iov_iter_xarray(&rdata->iter, ITER_DEST, &rdata->mapping->i_pages, + rdata->offset, rdata->bytes); + rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (!rc) { if (rdata->cfile->invalidHandle) @@ -4831,18 +4608,15 @@ static void cifs_readahead(struct readahead_control *ractl) if (rc) { add_credits_and_wake_if(server, &rdata->credits, 0); - for (i = 0; i < rdata->nr_pages; i++) { - page = rdata->pages[i]; - unlock_page(page); - put_page(page); - } + 
cifs_unlock_folios(rdata->mapping, + rdata->offset / PAGE_SIZE, + (rdata->offset + rdata->bytes - 1) / PAGE_SIZE); /* Fallback to the readpage in error/reconnect cases */ kref_put(&rdata->refcount, cifs_readdata_release); break; } kref_put(&rdata->refcount, cifs_readdata_release); - last_batch_size = nr_pages; } free_xid(xid); @@ -4884,10 +4658,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page, flush_dcache_page(page); SetPageUptodate(page); - - /* send this page to the cache */ - cifs_readpage_to_fscache(file_inode(file), page); - rc = 0; io_error: @@ -5274,3 +5044,19 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .launder_folio = cifs_launder_folio, .migrate_folio = filemap_migrate_folio, }; + +/* + * Splice data from a file into a pipe. + */ +ssize_t cifs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) + return 0; + if (unlikely(!len)) + return 0; + if (in->f_flags & O_DIRECT) + return direct_splice_read(in, ppos, pipe, len, flags); + return filemap_splice_read(in, ppos, pipe, len, flags); +} diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 0911327ebfde..8f6909d633da 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -163,20 +163,16 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) /* * Fallback page writing interface. */ -static int fscache_fallback_write_page(struct inode *inode, struct page *page, - bool no_space_allocated_yet) +static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_t len, + bool no_space_allocated_yet) { struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec; - loff_t start = page_offset(page); - size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); + iov_iter_xarray(&iter, ITER_SOURCE, &inode->i_mapping->i_pages, start, len); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) @@ -185,7 +181,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), no_space_allocated_yet); if (ret == 0) - ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); + ret = fscache_write(&cres, start, &iter, NULL, NULL); fscache_end_operation(&cres); return ret; } @@ -209,12 +205,12 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) return 0; } -void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) +void __cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) { - cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", - __func__, cifs_inode_cookie(inode), page, inode); + cifs_dbg(FYI, "%s: (fsc: %p, p: %llx, l: %zx, i: %p)\n", + __func__, cifs_inode_cookie(inode), pos, len, inode); - fscache_fallback_write_page(inode, page, true); + fscache_fallback_write_pages(inode, pos, len, true); } /* diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 67b601041f0a..173999610997 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -90,7 +90,7 @@ static inline int cifs_fscache_query_occupancy(struct inode *inode, } extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage); -extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage); +extern void 
__cifs_readahead_to_fscache(struct inode *pinode, loff_t pos, size_t len); static inline int cifs_readpage_from_fscache(struct inode *inode, @@ -101,11 +101,11 @@ static inline int cifs_readpage_from_fscache(struct inode *inode, return -ENOBUFS; } -static inline void cifs_readpage_to_fscache(struct inode *inode, - struct page *page) +static inline void cifs_readahead_to_fscache(struct inode *inode, + loff_t pos, size_t len) { if (cifs_inode_cookie(inode)) - __cifs_readpage_to_fscache(inode, page); + __cifs_readahead_to_fscache(inode, pos, len); } #else /* CONFIG_CIFS_FSCACHE */ @@ -141,7 +141,7 @@ cifs_readpage_from_fscache(struct inode *inode, struct page *page) } static inline -void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} +void cifs_readahead_to_fscache(struct inode *inode, loff_t pos, size_t len) {} #endif /* CONFIG_CIFS_FSCACHE */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 11cdc7cfe0ba..1087ac6104a9 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -508,14 +508,15 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -1518,14 +1519,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = DELETE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc != 0) @@ -2112,15 +2114,16 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - /* open the file to be renamed -- we need DELETE perms */ - oparms.desired_access = DELETE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = from_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + /* open the file to be renamed -- we need DELETE perms */ + .desired_access = DELETE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = from_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc == 0) { diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 4510dea77be3..7d97c10f2453 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -271,14 +271,15 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, int buf_type = CIFS_NO_BUFFER; FILE_ALL_INFO file_info; - oparms.tcon = tcon; - 
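The cifs_open_parms conversions above (and the similar ones further down) replace member-by-member assignment with a compound-literal assignment; every member not named in the literal is zero-initialized, which is why the explicit .reconnect = false lines can be dropped. A minimal generic sketch, using an illustrative struct rather than the real cifs_open_parms:

#include <linux/types.h>

struct example_parms {
	int desired_access;
	int disposition;
	bool reconnect;		/* not named below, so implicitly false */
};

static void example_init(struct example_parms *p, int access, int disp)
{
	*p = (struct example_parms) {
		.desired_access = access,
		.disposition = disp,
	};
}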
oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, &file_info); if (rc) @@ -313,14 +314,15 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .path = path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -355,13 +357,14 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct smb2_file_all_info *pfile_info = NULL; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_READ; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_READ, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .fid = &fid, + }; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) @@ -421,14 +424,15 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_CREATE; - oparms.fid = &fid; - oparms.reconnect = false; - oparms.mode = 0644; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_CREATE, + .fid = &fid, + .mode = 0644, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 95cc4d7dd806..2905734eb289 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -966,16 +966,22 @@ cifs_aio_ctx_release(struct kref *refcount) /* * ctx->bv is only set if setup_aio_ctx_iter() was call successfuly - * which means that iov_iter_get_pages() was a success and thus that - * we have taken reference on pages. + * which means that iov_iter_extract_pages() was a success and thus + * that we may have references or pins on pages that we need to + * release. 
*/ if (ctx->bv) { - unsigned i; + if (ctx->should_dirty || ctx->bv_need_unpin) { + unsigned int i; - for (i = 0; i < ctx->npages; i++) { - if (ctx->should_dirty) - set_page_dirty(ctx->bv[i].bv_page); - put_page(ctx->bv[i].bv_page); + for (i = 0; i < ctx->nr_pinned_pages; i++) { + struct page *page = ctx->bv[i].bv_page; + + if (ctx->should_dirty) + set_page_dirty(page); + if (ctx->bv_need_unpin) + unpin_user_page(page); + } } kvfree(ctx->bv); } @@ -983,94 +989,6 @@ cifs_aio_ctx_release(struct kref *refcount) kfree(ctx); } -#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) - -int -setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) -{ - ssize_t rc; - unsigned int cur_npages; - unsigned int npages = 0; - unsigned int i; - size_t len; - size_t count = iov_iter_count(iter); - unsigned int saved_len; - size_t start; - unsigned int max_pages = iov_iter_npages(iter, INT_MAX); - struct page **pages = NULL; - struct bio_vec *bv = NULL; - - if (iov_iter_is_kvec(iter)) { - memcpy(&ctx->iter, iter, sizeof(*iter)); - ctx->len = count; - iov_iter_advance(iter, count); - return 0; - } - - if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT) - bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL); - - if (!bv) { - bv = vmalloc(array_size(max_pages, sizeof(*bv))); - if (!bv) - return -ENOMEM; - } - - if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT) - pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL); - - if (!pages) { - pages = vmalloc(array_size(max_pages, sizeof(*pages))); - if (!pages) { - kvfree(bv); - return -ENOMEM; - } - } - - saved_len = count; - - while (count && npages < max_pages) { - rc = iov_iter_get_pages2(iter, pages, count, max_pages, &start); - if (rc < 0) { - cifs_dbg(VFS, "Couldn't get user pages (rc=%zd)\n", rc); - break; - } - - if (rc > count) { - cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc, - count); - break; - } - - count -= rc; - rc += start; - cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE); - - if (npages + cur_npages > max_pages) { - cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n", - npages + cur_npages, max_pages); - break; - } - - for (i = 0; i < cur_npages; i++) { - len = rc > PAGE_SIZE ? PAGE_SIZE : rc; - bvec_set_page(&bv[npages + i], pages[i], len - start, - start); - rc -= len; - start = 0; - } - - npages += cur_npages; - } - - kvfree(pages); - ctx->bv = bv; - ctx->len = saved_len - count; - ctx->npages = npages; - iov_iter_bvec(&ctx->iter, rw, ctx->bv, npages, ctx->len); - return 0; -} - /** * cifs_alloc_hash - allocate hash and hash context together * @name: The name of the crypto hash algo @@ -1128,25 +1046,6 @@ cifs_free_hash(struct shash_desc **sdesc) *sdesc = NULL; } -/** - * rqst_page_get_length - obtain the length and offset for a page in smb_rqst - * @rqst: The request descriptor - * @page: The index of the page to query - * @len: Where to store the length for this page: - * @offset: Where to store the offset for this page - */ -void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) -{ - *len = rqst->rq_pagesz; - *offset = (page == 0) ? 
rqst->rq_offset : 0; - - if (rqst->rq_npages == 1 || page == rqst->rq_npages-1) - *len = rqst->rq_tailsz; - else if (page == 0) - *len = rqst->rq_pagesz - rqst->rq_offset; -} - void extract_unc_hostname(const char *unc, const char **h, size_t *len) { const char *end; diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 55758b9ec877..2c5dde2ece58 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -83,7 +83,7 @@ typedef struct _NEGOTIATE_MESSAGE { SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; @@ -135,7 +135,7 @@ typedef struct _AUTHENTICATE_MESSAGE { __le32 NegotiateFlags; /* SECURITY_BUFFER for version info not present since we do not set the version is present flag */ - char UserString[0]; + char UserString[]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; /* diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2d75ba5aaa8a..ef638086d734 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -495,7 +495,7 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) FIND_FILE_STANDARD_INFO *pfData; pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; - new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 + pfData->FileNameLength; } else { u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset); @@ -513,9 +513,9 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) new_entry, end_of_smb, old_entry); return NULL; } else if (((level == SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) + 1 > end_of_smb)) || ((level != SMB_FIND_FILE_INFO_STANDARD) && - (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { + (new_entry + sizeof(FILE_DIRECTORY_INFO) + 1 > end_of_smb))) { cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n", new_entry, end_of_smb); return NULL; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c47b254f0d1e..d2cbae4b5d21 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -480,7 +480,6 @@ out: * remove this channel */ cancel_delayed_work_sync(&chan->server->echo); - cancel_delayed_work_sync(&chan->server->resolve); cancel_delayed_work_sync(&chan->server->reconnect); spin_lock(&ses->chan_lock); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 4cb364454e13..abda6148be10 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -576,14 +576,15 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) return 0; - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -823,14 +824,15 @@ smb_set_file_info(struct inode *inode, const char *full_path, goto out; } - 
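The NEGOTIATE_MESSAGE/AUTHENTICATE_MESSAGE changes in ntlmssp.h above swap the old zero-length-array idiom for a C99 flexible array member; the trailing member still contributes nothing to sizeof, but the compiler and fortify checks can now treat it as genuinely variable length. A small illustrative declaration (not taken from the patch):

#include <linux/types.h>

struct example_blob_msg {
	__le16 length;		/* length of the trailing payload */
	u8 payload[];		/* flexible array member: adds 0 to sizeof */
} __attribute__((packed));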
oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); rc = CIFS_open(xid, &oparms, &oplock, NULL); @@ -998,15 +1000,16 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, goto out; } - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.create_options = cifs_create_options(cifs_sb, - OPEN_REPARSE_POINT); - oparms.disposition = FILE_OPEN; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) @@ -1115,15 +1118,16 @@ cifs_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 9f1dd04b555a..e0ee96d69d49 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -35,7 +35,7 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp, ErrorContextData) + sizeof(struct smb2_symlink_err_rsp)); - if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err)) + if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err) + 1) return ERR_PTR(-EINVAL); p = (struct smb2_error_context_rsp *)err->ErrorData; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 8521adf9ce79..37b4cd59245d 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -105,14 +105,15 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } - vars->oparms.tcon = tcon; - vars->oparms.desired_access = desired_access; - vars->oparms.disposition = create_disposition; - vars->oparms.create_options = cifs_create_options(cifs_sb, create_options); - vars->oparms.fid = &fid; - vars->oparms.reconnect = false; - vars->oparms.mode = mode; - vars->oparms.cifs_sb = cifs_sb; + vars->oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = create_disposition, + .create_options = cifs_create_options(cifs_sb, create_options), 
+ .fid = &fid, + .mode = mode, + .cifs_sb = cifs_sb, + }; rqst[num_rqst].rq_iov = &vars->open_iov[0]; rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 572293c18e16..3935a60db5c3 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -113,7 +113,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, } else if (nc_offset + 1 == non_ctxlen) { cifs_dbg(FYI, "no SPNEGO security blob in negprot rsp\n"); size_of_pad_before_neg_ctxts = 0; - } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE) + } else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE + 1) /* has padding, but no SPNEGO blob */ size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen + 1; else diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cb2deac6b2d7..f79b075f2992 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -729,12 +729,13 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct cached_fid *cfid = NULL; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = open_cached_dir(xid, tcon, "", cifs_sb, false, &cfid); if (rc == 0) @@ -771,12 +772,13 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -816,12 +818,13 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, &err_iov, &err_buftype); @@ -1097,13 +1100,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_WRITE_EA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_WRITE_EA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -1453,12 +1456,12 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[0].rq_iov = &vars->open_iov[0]; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, 
sizeof(oparms)); - oparms.tcon = tcon; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; if (qi.flags & PASSTHRU_FSCTL) { switch (qi.info_type & FSCTL_DEVICE_ACCESS_MASK) { @@ -2088,12 +2091,13 @@ smb3_notify(const unsigned int xid, struct file *pfile, } tcon = cifs_sb_master_tcon(cifs_sb); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -2159,12 +2163,13 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -2490,12 +2495,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = desired_access, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -2623,12 +2629,13 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon, if (!tcon->posix_extensions) return smb2_queryfs(xid, tcon, cifs_sb, buf); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, 0), + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); @@ -2916,13 +2923,13 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, create_options); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = 
FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, create_options), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3056,13 +3063,13 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - memset(&oparms, 0, sizeof(oparms)); - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; - oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT); - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = FILE_READ_ATTRIBUTES, + .disposition = FILE_OPEN, + .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT), + .fid = &fid, + }; rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, utf16_path); @@ -3196,17 +3203,20 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, return ERR_PTR(rc); } - oparms.tcon = tcon; - oparms.desired_access = READ_CONTROL; - oparms.disposition = FILE_OPEN; - /* - * When querying an ACL, even if the file is a symlink we want to open - * the source not the target, and so the protocol requires that the - * client specify this flag when opening a reparse point - */ - oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = READ_CONTROL, + .disposition = FILE_OPEN, + /* + * When querying an ACL, even if the file is a symlink + * we want to open the source not the target, and so + * the protocol requires that the client specify this + * flag when opening a reparse point + */ + .create_options = cifs_create_options(cifs_sb, 0) | + OPEN_REPARSE_POINT, + .fid = &fid, + }; if (info & SACL_SECINFO) oparms.desired_access |= SYSTEM_SECURITY; @@ -3265,13 +3275,14 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return rc; } - oparms.tcon = tcon; - oparms.desired_access = access_flags; - oparms.create_options = cifs_create_options(cifs_sb, 0); - oparms.disposition = FILE_OPEN; - oparms.path = path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .desired_access = access_flags, + .create_options = cifs_create_options(cifs_sb, 0), + .disposition = FILE_OPEN, + .path = path, + .fid = &fid, + }; rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL, NULL); @@ -4227,8 +4238,8 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - unsigned int *num_sgs) + struct aead_request **req, struct sg_table *sgt, + unsigned int *num_sgs, size_t *sensitive_size) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,70 +4247,75 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst u8 *p; *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); + if (IS_ERR_VALUE((long)(int)*num_sgs)) + return ERR_PTR(*num_sgs); len = iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); - len += *num_sgs * sizeof(**sgl); + len += 
array_size(*num_sgs, sizeof(struct scatterlist)); + *sensitive_size = len; - p = kmalloc(len, GFP_ATOMIC); + p = kvzalloc(len, GFP_NOFS); if (!p) - return NULL; + return ERR_PTR(-ENOMEM); *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); - *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, - __alignof__(struct scatterlist)); + sgt->sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); return p; } -static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, +static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl) + struct aead_request **req, struct scatterlist **sgl, + size_t *sensitive_size) { - unsigned int off, len, skip; - struct scatterlist *sg; - unsigned int num_sgs; - unsigned long addr; - int i, j; + struct sg_table sgtable = {}; + unsigned int skip, num_sgs, i, j; + ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); - if (!p) - return NULL; + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, + &num_sgs, sensitive_size); + if (IS_ERR(p)) + return ERR_CAST(p); - sg_init_table(*sgl, num_sgs); - sg = *sgl; + sg_init_marker(sgtable.sgl, num_sgs); - /* Assumes the first rqst has a transform header as the first iov. - * I.e. - * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. */ + skip = 20; + for (i = 0; i < num_rqst; i++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob. - */ + struct iov_iter *iter = &rqst[i].rq_iter; + size_t count = iov_iter_count(iter); + for (j = 0; j < rqst[i].rq_nvec; j++) { - struct kvec *iov = &rqst[i].rq_iov[j]; + cifs_sg_set_buf(&sgtable, + rqst[i].rq_iov[j].iov_base + skip, + rqst[i].rq_iov[j].iov_len - skip); - skip = (i == 0) && (j == 0) ? 
20 : 0; - addr = (unsigned long)iov->iov_base + skip; - len = iov->iov_len - skip; - sg = cifs_sg_set_buf(sg, (void *)addr, len); - } - for (j = 0; j < rqst[i].rq_npages; j++) { - rqst_page_get_length(&rqst[i], j, &len, &off); - sg_set_page(sg++, rqst[i].rq_pages[j], len, off); + /* See the above comment on the 'skip' assignment */ + skip = 0; } + sgtable.orig_nents = sgtable.nents; + + rc = netfs_extract_iter_to_sg(iter, count, &sgtable, + num_sgs - sgtable.nents, 0); + iov_iter_revert(iter, rc); + sgtable.orig_nents = sgtable.nents; } - cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + cifs_sg_set_buf(&sgtable, sig, SMB2_SIGNATURE_SIZE); + sg_mark_end(&sgtable.sgl[sgtable.nents - 1]); + *sgl = sgtable.sgl; return p; } @@ -4353,6 +4369,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; + size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4386,9 +4403,10 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); - if (unlikely(!creq)) - return -ENOMEM; + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, + &sensitive_size); + if (IS_ERR(creq)) + return PTR_ERR(creq); if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); @@ -4416,22 +4434,35 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(creq); + kvfree_sensitive(creq, sensitive_size); return rc; } +/* + * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + */ +static void cifs_clear_xarray_buffer(struct xarray *buffer) +{ + struct folio *folio; + + XA_STATE(xas, buffer, 0); + + rcu_read_lock(); + xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { + folio_put(folio); + } + rcu_read_unlock(); + xa_destroy(buffer); +} + void smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) { - int i, j; + int i; - for (i = 0; i < num_rqst; i++) { - if (rqst[i].rq_pages) { - for (j = rqst[i].rq_npages - 1; j >= 0; j--) - put_page(rqst[i].rq_pages[j]); - kfree(rqst[i].rq_pages); - } - } + for (i = 0; i < num_rqst; i++) + if (!xa_empty(&rqst[i].rq_buffer)) + cifs_clear_xarray_buffer(&rqst[i].rq_buffer); } /* @@ -4451,9 +4482,8 @@ static int smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *new_rq, struct smb_rqst *old_rq) { - struct page **pages; struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; - unsigned int npages; + struct page *page; unsigned int orig_len = 0; int i, j; int rc = -ENOMEM; @@ -4461,40 +4491,45 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, for (i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; + struct xarray *buffer = &new->rq_buffer; + size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; orig_len += smb_rqst_len(server, old); new->rq_iov = old->rq_iov; new->rq_nvec = old->rq_nvec; - npages = old->rq_npages; - if (!npages) - continue; - - pages = kmalloc_array(npages, sizeof(struct page *), - GFP_KERNEL); - if (!pages) - goto err_free; - - new->rq_pages = pages; - new->rq_npages = npages; - new->rq_offset = old->rq_offset; - new->rq_pagesz = old->rq_pagesz; - new->rq_tailsz = old->rq_tailsz; - - for (j = 0; j < npages; j++) { - pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - 
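crypt_message()/smb2_aead_req_alloc() above now allocate the combined IV, AEAD request and scatterlist buffer with kvzalloc() and release it with kvfree_sensitive(), which needs the allocation size so the memory can be scrubbed before it is freed. A minimal sketch of that pairing, under hypothetical names:

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>

struct example_crypt_buf {
	void *p;
	size_t size;
};

static int example_crypt_buf_alloc(struct example_crypt_buf *buf, size_t len)
{
	buf->p = kvzalloc(len, GFP_NOFS);
	if (!buf->p)
		return -ENOMEM;
	buf->size = len;	/* remembered so the free side can zero it */
	return 0;
}

static void example_crypt_buf_free(struct example_crypt_buf *buf)
{
	kvfree_sensitive(buf->p, buf->size);	/* zeroes, then kvfree()s */
	buf->p = NULL;
}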
if (!pages[j]) - goto err_free; - } + xa_init(buffer); + + if (size > 0) { + unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); + + for (j = 0; j < npages; j++) { + void *o; - /* copy pages form the old */ - for (j = 0; j < npages; j++) { - unsigned int offset, len; + rc = -ENOMEM; + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto err_free; + page->index = j; + o = xa_store(buffer, j, page, GFP_KERNEL); + if (xa_is_err(o)) { + rc = xa_err(o); + put_page(page); + goto err_free; + } - rqst_page_get_length(new, j, &len, &offset); + xa_set_mark(buffer, j, XA_MARK_0); - memcpy_page(new->rq_pages[j], offset, - old->rq_pages[j], offset, len); + seg = min_t(size_t, size - copied, PAGE_SIZE); + if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { + rc = -EFAULT; + goto err_free; + } + copied += seg; + } + iov_iter_xarray(&new->rq_iter, ITER_SOURCE, + buffer, 0, size); + new->rq_iter_size = size; } } @@ -4523,12 +4558,12 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, - unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size, + unsigned int buf_data_size, struct iov_iter *iter, bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; + size_t iter_size = 0; int rc; iov[0].iov_base = buf; @@ -4538,10 +4573,11 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_iov = iov; rqst.rq_nvec = 2; - rqst.rq_pages = pages; - rqst.rq_npages = npages; - rqst.rq_pagesz = PAGE_SIZE; - rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE; + if (iter) { + rqst.rq_iter = *iter; + rqst.rq_iter_size = iov_iter_count(iter); + iter_size = iov_iter_count(iter); + } rc = crypt_message(server, 1, &rqst, 0); cifs_dbg(FYI, "Decrypt message returned %d\n", rc); @@ -4552,73 +4588,37 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); if (!is_offloaded) - server->total_read = buf_data_size + page_data_size; + server->total_read = buf_data_size + iter_size; return rc; } static int -read_data_into_pages(struct TCP_Server_Info *server, struct page **pages, - unsigned int npages, unsigned int len) +cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, + unsigned int skip, struct iov_iter *iter) { - int i; - int length; + struct page *page; + unsigned long index; - for (i = 0; i < npages; i++) { - struct page *page = pages[i]; - size_t n; + xa_for_each(pages, index, page) { + size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); - n = len; - if (len >= PAGE_SIZE) { - /* enough data to fill the page */ - n = PAGE_SIZE; - len -= n; - } else { - zero_user(page, len, PAGE_SIZE - len); - len = 0; + n = copy_page_to_iter(page, skip, len, iter); + if (n != len) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + return -EIO; } - length = cifs_read_page_from_socket(server, page, 0, n); - if (length < 0) - return length; - server->total_read += length; + data_size -= n; + skip = 0; } return 0; } static int -init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, - unsigned int cur_off, struct bio_vec **page_vec) -{ - struct bio_vec *bvec; - int i; - - bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL); - if (!bvec) - return -ENOMEM; - - for (i = 0; i < npages; i++) { - bvec_set_page(&bvec[i], pages[i], - min_t(unsigned int, PAGE_SIZE, data_size), - i == 0 ? 
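smb3_init_transform_rq() above (and receive_encrypted_read() further down) back the encrypted payload with pages stored in an xarray, marking each slot with XA_MARK_0 so that cifs_clear_xarray_buffer() can later find and put exactly the pages the buffer owns. A minimal sketch of filling such a buffer, with an illustrative function name; the caller is assumed to have run xa_init() on the xarray:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/xarray.h>

static int example_fill_xarray_buffer(struct xarray *buffer, size_t size)
{
	unsigned int i, npages = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < npages; i++) {
		struct page *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		void *old;

		if (!page)
			return -ENOMEM;
		old = xa_store(buffer, i, page, GFP_KERNEL);
		if (xa_is_err(old)) {
			put_page(page);
			return xa_err(old);
		}
		/* Marked entries are the ones this buffer owns and must put. */
		xa_set_mark(buffer, i, XA_MARK_0);
	}
	return 0;
}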
cur_off : 0); - data_size -= bvec[i].bv_len; - } - - if (data_size != 0) { - cifs_dbg(VFS, "%s: something went wrong\n", __func__); - kfree(bvec); - return -EIO; - } - - *page_vec = bvec; - return 0; -} - -static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, - char *buf, unsigned int buf_len, struct page **pages, - unsigned int npages, unsigned int page_data_size, - bool is_offloaded) + char *buf, unsigned int buf_len, struct xarray *pages, + unsigned int pages_len, bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4627,9 +4627,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; - struct bio_vec *bvec = NULL; - struct iov_iter iter; - struct kvec iov; int length; bool use_rdma_mr = false; @@ -4718,7 +4715,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - if (data_len > page_data_size - pad_len) { + if (data_len > pages_len - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; if (is_offloaded) @@ -4728,8 +4725,9 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - rdata->result = init_read_bvec(pages, npages, page_data_size, - cur_off, &bvec); + /* Copy the data to the output I/O iterator. */ + rdata->result = cifs_copy_pages_to_iter(pages, pages_len, + cur_off, &rdata->iter); if (rdata->result != 0) { if (is_offloaded) mid->mid_state = MID_RESPONSE_MALFORMED; @@ -4737,14 +4735,16 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(mid, rdata->result); return 0; } + rdata->got_bytes = pages_len; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len); } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ - WARN_ONCE(npages > 0, "read data can be either in buf or in pages"); - iov.iov_base = buf + data_offset; - iov.iov_len = data_len; - iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len); + WARN_ONCE(pages && !xa_empty(pages), + "read data can be either in buf or in pages"); + length = copy_to_iter(buf + data_offset, data_len, &rdata->iter); + if (length < 0) + return length; + rdata->got_bytes = data_len; } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); @@ -4756,26 +4756,18 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - length = rdata->copy_into_pages(server, rdata, &iter); - - kfree(bvec); - - if (length < 0) - return length; - if (is_offloaded) mid->mid_state = MID_RESPONSE_RECEIVED; else dequeue_mid(mid, false); - return length; + return 0; } struct smb2_decrypt_work { struct work_struct decrypt; struct TCP_Server_Info *server; - struct page **ppages; + struct xarray buffer; char *buf; - unsigned int npages; unsigned int len; }; @@ -4784,11 +4776,13 @@ static void smb2_decrypt_offload(struct work_struct *work) { struct smb2_decrypt_work *dw = container_of(work, struct smb2_decrypt_work, decrypt); - int i, rc; + int rc; struct mid_q_entry *mid; + struct iov_iter iter; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, true); + &iter, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto free_pages; @@ -4802,7 +4796,7 @@ static void 
smb2_decrypt_offload(struct work_struct *work) mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len, + &dw->buffer, dw->len, true); if (rc >= 0) { #ifdef CONFIG_CIFS_STATS2 @@ -4835,10 +4829,7 @@ static void smb2_decrypt_offload(struct work_struct *work) } free_pages: - for (i = dw->npages-1; i >= 0; i--) - put_page(dw->ppages[i]); - - kfree(dw->ppages); + cifs_clear_xarray_buffer(&dw->buffer); cifs_small_buf_release(dw->buf); kfree(dw); } @@ -4848,47 +4839,66 @@ static int receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, int *num_mids) { + struct page *page; char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; - unsigned int npages; - struct page **pages; - unsigned int len; + struct iov_iter iter; + unsigned int len, npages; unsigned int buflen = server->pdu_size; int rc; int i = 0; struct smb2_decrypt_work *dw; + dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); + if (!dw) + return -ENOMEM; + xa_init(&dw->buffer); + INIT_WORK(&dw->decrypt, smb2_decrypt_offload); + dw->server = server; + *num_mids = 1; len = min_t(unsigned int, buflen, server->vals->read_rsp_size + sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1; rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len); if (rc < 0) - return rc; + goto free_dw; server->total_read += rc; len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; + dw->len = len; npages = DIV_ROUND_UP(len, PAGE_SIZE); - pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - rc = -ENOMEM; - goto discard_data; - } - + rc = -ENOMEM; for (; i < npages; i++) { - pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!pages[i]) { - rc = -ENOMEM; + void *old; + + page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); + if (!page) + goto discard_data; + page->index = i; + old = xa_store(&dw->buffer, i, page, GFP_KERNEL); + if (xa_is_err(old)) { + rc = xa_err(old); + put_page(page); goto discard_data; } + xa_set_mark(&dw->buffer, i, XA_MARK_0); } - /* read read data into pages */ - rc = read_data_into_pages(server, pages, npages, len); - if (rc) - goto free_pages; + iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + + /* Read the data into the buffer and clear excess bufferage. 
*/ + rc = cifs_read_iter_from_socket(server, &iter, dw->len); + if (rc < 0) + goto discard_data; + + server->total_read += rc; + if (rc < npages * PAGE_SIZE) + iov_iter_zero(npages * PAGE_SIZE - rc, &iter); + iov_iter_revert(&iter, npages * PAGE_SIZE); + iov_iter_truncate(&iter, dw->len); rc = cifs_discard_remaining_data(server); if (rc) @@ -4901,39 +4911,28 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, if ((server->min_offload) && (server->in_flight > 1) && (server->pdu_size >= server->min_offload)) { - dw = kmalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); - if (dw == NULL) - goto non_offloaded_decrypt; - dw->buf = server->smallbuf; server->smallbuf = (char *)cifs_small_buf_get(); - INIT_WORK(&dw->decrypt, smb2_decrypt_offload); - - dw->npages = npages; - dw->server = server; - dw->ppages = pages; - dw->len = len; queue_work(decrypt_wq, &dw->decrypt); *num_mids = 0; /* worker thread takes care of finding mid */ return -1; } -non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &iter, false); if (rc) goto free_pages; *mid = smb2_find_mid(server, buf); - if (*mid == NULL) + if (*mid == NULL) { cifs_dbg(FYI, "mid not found\n"); - else { + } else { cifs_dbg(FYI, "mid found\n"); (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - pages, npages, len, false); + &dw->buffer, dw->len, false); if (rc >= 0) { if (server->ops->is_network_name_deleted) { server->ops->is_network_name_deleted(buf, @@ -4943,9 +4942,9 @@ non_offloaded_decrypt: } free_pages: - for (i = i - 1; i >= 0; i--) - put_page(pages[i]); - kfree(pages); + cifs_clear_xarray_buffer(&dw->buffer); +free_dw: + kfree(dw); return rc; discard_data: cifs_discard_remaining_data(server); @@ -4983,7 +4982,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); + length = decrypt_raw_data(server, buf, buf_size, NULL, false); if (length) return length; @@ -5082,7 +5081,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) char *buf = server->large_buf ? 
server->bigbuf : server->smallbuf; return handle_read_data(server, mid, buf, server->pdu_size, - NULL, 0, 0, false); + NULL, 0, false); } static int @@ -5134,15 +5133,16 @@ smb2_make_node(unsigned int xid, struct inode *inode, cifs_dbg(FYI, "sfu compat create special file\n"); - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL); - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = GENERIC_WRITE, + .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | + CREATE_OPTION_SPECIAL), + .disposition = FILE_CREATE, + .path = full_path, + .fid = &fid, + }; if (tcon->ses->server->oplocks) oplock = REQ_OPLOCK; @@ -5629,7 +5629,7 @@ struct smb_version_values smb20_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5651,7 +5651,7 @@ struct smb_version_values smb21_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5672,7 +5672,7 @@ struct smb_version_values smb3any_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5693,7 +5693,7 @@ struct smb_version_values smbdefault_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5714,7 +5714,7 @@ struct smb_version_values smb30_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5735,7 +5735,7 @@ struct smb_version_values smb302_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -5756,7 +5756,7 @@ struct smb_version_values smb311_values = { .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2c9ffa921e6f..ca9d7110ddcb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -139,6 +139,66 @@ out: return; } +static int wait_for_server_reconnect(struct TCP_Server_Info *server, + __le16 
smb2_command, bool retry) +{ + int timeout = 10; + int rc; + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + timeout *= server->nr_targets; + spin_unlock(&server->srv_lock); + + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + return -EAGAIN; + } + + /* + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. + * + * On "soft" mounts we wait once. Hard mounts keep retrying until + * process is killed or server comes back on-line. + */ + do { + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + timeout * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", + __func__); + return -ERESTARTSYS; + } + + /* are we still trying to reconnect? */ + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + spin_unlock(&server->srv_lock); + } while (retry); + + cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); + return -EHOSTDOWN; +} + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) @@ -146,7 +206,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, int rc = 0; struct nls_table *nls_codepage; struct cifs_ses *ses; - int retries; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -184,61 +243,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, (!tcon->ses->server) || !server) return -EIO; - ses = tcon->ses; - retries = server->nr_targets; - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - */ - while (server->tcpStatus == CifsNeedReconnect) { - /* - * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE - * here since they are implicitly done when session drops. - */ - switch (smb2_command) { - /* - * BB Should we keep oplock break and add flush to exceptions? - */ - case SMB2_TREE_DISCONNECT: - case SMB2_CANCEL: - case SMB2_CLOSE: - case SMB2_OPLOCK_BREAK: - return -EAGAIN; - } - - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - 10 * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - break; - } - spin_unlock(&server->srv_lock); - - if (retries && --retries) - continue; + rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); + if (rc) + return rc; - /* - * on "soft" mounts we wait once. 
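The new wait_for_server_reconnect() above leans on the return-value convention of wait_event_interruptible_timeout(): a negative value means the wait was interrupted by a signal, zero means the timeout expired with the condition still false, and a positive value is the remaining jiffies once the condition became true. A minimal sketch of that convention with hypothetical names, mapping the outcomes to the same error codes the patch uses:

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/wait.h>

static int example_wait_for_flag(wait_queue_head_t *wq, bool *flag,
				 long timeout_jiffies)
{
	long rc = wait_event_interruptible_timeout(*wq, READ_ONCE(*flag),
						   timeout_jiffies);

	if (rc < 0)
		return -ERESTARTSYS;	/* interrupted by a signal */
	if (rc == 0)
		return -EHOSTDOWN;	/* timed out, condition still false */
	return 0;			/* condition became true in time */
}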
Hard mounts keep - * retrying until process is killed or server comes - * back on-line - */ - if (!tcon->retry) { - cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); - return -EHOSTDOWN; - } - retries = server->nr_targets; - } + ses = tcon->ses; spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { @@ -1364,7 +1373,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); + cpu_to_le16(sizeof(struct smb2_sess_setup_req)); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); memset(&rqst, 0, sizeof(struct smb_rqst)); @@ -1858,12 +1867,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, if (unc_path == NULL) return -ENOMEM; - unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp) + 1; - unc_path_len *= 2; - if (unc_path_len < 2) { + unc_path_len = cifs_strtoUTF16(unc_path, tree, strlen(tree), cp); + if (unc_path_len <= 0) { kfree(unc_path); return -EINVAL; } + unc_path_len *= 2; /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; @@ -1883,9 +1892,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[0].iov_len = total_len - 1; /* Testing shows that buffer offset must be at location of Buffer[0] */ - req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) - - 1 /* pad */); - req->PathLength = cpu_to_le16(unc_path_len - 2); + req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)); + req->PathLength = cpu_to_le16(unc_path_len); iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; @@ -3764,7 +3772,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, ses->Suid, (u8)watch_tree, completion_filter); /* validate that notify information is plausible */ if ((rsp_iov.iov_base == NULL) || - (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp))) + (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp) + 1)) goto cnotify_exit; smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base; @@ -3898,7 +3906,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + tcon = tconInfoAlloc(); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -3921,7 +3929,7 @@ void smb2_reconnect_server(struct work_struct *work) list_del_init(&ses->rlist); cifs_put_smb_ses(ses); } - kfree(tcon); + tconInfoFree(tcon); done: cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); @@ -4054,6 +4062,36 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, return rc; } +#ifdef CONFIG_CIFS_SMB_DIRECT +static inline bool smb3_use_rdma_offload(struct cifs_io_parms *io_parms) +{ + struct TCP_Server_Info *server = io_parms->server; + struct cifs_tcon *tcon = io_parms->tcon; + + /* we can only offload if we're connected */ + if (!server || !tcon) + return false; + + /* we can only offload on an rdma connection */ + if (!server->rdma || !server->smbd_conn) + return false; + + /* we don't support signed offload yet */ + if (server->sign) + return false; + + /* we don't support encrypted offload yet */ + if (smb3_encryption_required(tcon)) + return false; + + /* offload also has its overhead, so only do it if desired */ + if (io_parms->length < 
server->smbd_conn->rdma_readwrite_threshold) + return false; + + return true; +} +#endif /* CONFIG_CIFS_SMB_DIRECT */ + /* * To form a chain of read requests, any read requests after the first should * have the end_of_chain boolean set to true. @@ -4097,16 +4135,12 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (server->rdma && rdata && !server->sign && - rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; - rdata->mr = smbd_register_mr( - server->smbd_conn, rdata->pages, - rdata->nr_pages, rdata->page_offset, - rdata->tailsz, true, need_invalidate); + rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->iter, + true, need_invalidate); if (!rdata->mr) return -EAGAIN; @@ -4163,15 +4197,9 @@ smb2_readv_callback(struct mid_q_entry *mid) (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, }; - - if (rdata->got_bytes) { - rqst.rq_pages = rdata->pages; - rqst.rq_offset = rdata->page_offset; - rqst.rq_npages = rdata->nr_pages; - rqst.rq_pagesz = rdata->pagesz; - rqst.rq_tailsz = rdata->tailsz; - } + .rq_nvec = 1, + .rq_iter = rdata->iter, + .rq_iter_size = iov_iter_count(&rdata->iter), }; WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", @@ -4189,6 +4217,8 @@ smb2_readv_callback(struct mid_q_entry *mid) if (server->sign && !mid->decrypted) { int rc; + iov_iter_revert(&rqst.rq_iter, rdata->got_bytes); + iov_iter_truncate(&rqst.rq_iter, rdata->got_bytes); rc = smb2_verify_signature(&rqst, server); if (rc) cifs_tcon_dbg(VFS, "SMB signature verification returned error = %d\n", @@ -4495,10 +4525,27 @@ smb2_async_writev(struct cifs_writedata *wdata, struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; + struct cifs_io_parms _io_parms; + struct cifs_io_parms *io_parms = NULL; if (!wdata->server) server = wdata->server = cifs_pick_channel(tcon->ses); + /* + * in future we may get cifs_io_parms passed in from the caller, + * but for now we construct it here... 
+ */ + _io_parms = (struct cifs_io_parms) { + .tcon = tcon, + .server = server, + .offset = wdata->offset, + .length = wdata->bytes, + .persistent_fid = wdata->cfile->fid.persistent_fid, + .volatile_fid = wdata->cfile->fid.volatile_fid, + .pid = wdata->pid, + }; + io_parms = &_io_parms; + rc = smb2_plain_req_init(SMB2_WRITE, tcon, server, (void **) &req, &total_len); if (rc) @@ -4508,49 +4555,44 @@ smb2_async_writev(struct cifs_writedata *wdata, flags |= CIFS_TRANSFORM_REQ; shdr = (struct smb2_hdr *)req; - shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; - req->Channel = 0; - req->Offset = cpu_to_le64(wdata->offset); + req->Channel = SMB2_CHANNEL_NONE; + req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; - trace_smb3_write_enter(0 /* xid */, wdata->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes); + trace_smb3_write_enter(0 /* xid */, + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length); + #ifdef CONFIG_CIFS_SMB_DIRECT /* * If we want to do a server RDMA read, fill in and append * smbd_buffer_descriptor_v1 to the end of write request */ - if (server->rdma && !server->sign && wdata->bytes >= - server->smbd_conn->rdma_readwrite_threshold) { - + if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; + size_t data_size = iov_iter_count(&wdata->iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; - wdata->mr = smbd_register_mr( - server->smbd_conn, wdata->pages, - wdata->nr_pages, wdata->page_offset, - wdata->tailsz, false, need_invalidate); + wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->iter, + false, need_invalidate); if (!wdata->mr) { rc = -EAGAIN; goto async_writev_out; } req->Length = 0; req->DataOffset = 0; - if (wdata->nr_pages > 1) - req->RemainingBytes = - cpu_to_le32( - (wdata->nr_pages - 1) * wdata->pagesz - - wdata->page_offset + wdata->tailsz - ); - else - req->RemainingBytes = cpu_to_le32(wdata->tailsz); + req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4569,26 +4611,21 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 1; - rqst.rq_pages = wdata->pages; - rqst.rq_offset = wdata->page_offset; - rqst.rq_npages = wdata->nr_pages; - rqst.rq_pagesz = wdata->pagesz; - rqst.rq_tailsz = wdata->tailsz; + rqst.rq_iter = wdata->iter; + rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); #ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) { + if (wdata->mr) iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); - rqst.rq_npages = 0; - } #endif - cifs_dbg(FYI, "async write at %llu %u bytes\n", - wdata->offset, wdata->bytes); + cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", + io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); #ifdef CONFIG_CIFS_SMB_DIRECT /* For RDMA read, I/O size is in RemainingBytes not in Length */ if (!wdata->mr) - req->Length = cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #else - req->Length = 
cpu_to_le32(wdata->bytes); + req->Length = cpu_to_le32(io_parms->length); #endif if (wdata->credits.value > 0) { @@ -4596,7 +4633,7 @@ smb2_async_writev(struct cifs_writedata *wdata, SMB2_MAX_BUFFER_SIZE)); shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); - rc = adjust_credits(server, &wdata->credits, wdata->bytes); + rc = adjust_credits(server, &wdata->credits, io_parms->length); if (rc) goto async_writev_out; @@ -4609,9 +4646,12 @@ smb2_async_writev(struct cifs_writedata *wdata, if (rc) { trace_smb3_write_err(0 /* no xid */, - req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes, rc); + io_parms->persistent_fid, + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, + io_parms->length, + rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); } @@ -4906,7 +4946,7 @@ int SMB2_query_directory_init(const unsigned int xid, memcpy(bufptr, &asteriks, len); req->FileNameOffset = - cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_directory_req)); req->FileNameLength = cpu_to_le16(len); /* * BB could be 30 bytes or so longer if we used SMB2 specific @@ -4951,10 +4991,10 @@ smb2_parse_query_directory(struct cifs_tcon *tcon, switch (srch_inf->info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: - info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + info_buf_size = sizeof(FILE_DIRECTORY_INFO); break; case SMB_FIND_FILE_ID_FULL_DIR_INFO: - info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO); break; case SMB_FIND_FILE_POSIX_INFO: /* note that posix payload are variable size */ @@ -5102,8 +5142,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - req->BufferOffset = - cpu_to_le16(sizeof(struct smb2_set_info_req) - 1); + req->BufferOffset = cpu_to_le16(sizeof(struct smb2_set_info_req)); req->BufferLength = cpu_to_le32(*size); memcpy(req->Buffer, *data, *size); @@ -5337,9 +5376,9 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; /* 1 for pad */ req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); + cpu_to_le16(sizeof(struct smb2_query_info_req)); req->OutputBufferLength = cpu_to_le32( - outbuf_len + sizeof(struct smb2_query_info_rsp) - 1); + outbuf_len + sizeof(struct smb2_query_info_rsp)); iov->iov_base = (char *)req; iov->iov_len = total_len; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 1237bb86e93a..2114e8a0c63a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -57,7 +57,7 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL #define SMB2_SYMLINK_STRUCT_SIZE \ - (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + (sizeof(struct smb2_err_rsp) + sizeof(struct smb2_symlink_err_rsp)) #define SYMLINK_ERROR_TAG 0x4c4d5953 @@ -371,7 +371,7 @@ struct smb2_file_id_extd_directory_info { __le32 EaSize; /* EA size */ __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */ __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */ - char FileName[1]; + char FileName[]; } __packed; /* level 60 */ extern char smb2_padding[7]; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 8c816b25ce7c..55b6e319a61d 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -34,16 +34,21 @@ static int smbd_post_recv( 
struct smbd_response *response); static int smbd_post_send_empty(struct smbd_connection *info); -static int smbd_post_send_data( - struct smbd_connection *info, - struct kvec *iov, int n_vec, int remaining_data_length); -static int smbd_post_send_page(struct smbd_connection *info, - struct page *page, unsigned long offset, - size_t size, int remaining_data_length); static void destroy_mr_list(struct smbd_connection *info); static int allocate_mr_list(struct smbd_connection *info); +struct smb_extract_to_rdma { + struct ib_sge *sge; + unsigned int nr_sge; + unsigned int max_sge; + struct ib_device *device; + u32 local_dma_lkey; + enum dma_data_direction direction; +}; +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma); + /* SMBD version number */ #define SMBD_V1 0x0100 @@ -823,16 +828,16 @@ static int smbd_post_send(struct smbd_connection *info, return rc; } -static int smbd_post_send_sgl(struct smbd_connection *info, - struct scatterlist *sgl, int data_length, int remaining_data_length) +static int smbd_post_send_iter(struct smbd_connection *info, + struct iov_iter *iter, + int *_remaining_data_length) { - int num_sgs; int i, rc; int header_length; + int data_length; struct smbd_request *request; struct smbd_data_transfer *packet; int new_credits; - struct scatterlist *sg; wait_credit: /* Wait for send credits. A SMBD packet needs one credit */ @@ -876,6 +881,30 @@ wait_send_queue: } request->info = info; + memset(request->sge, 0, sizeof(request->sge)); + + /* Fill in the data payload to find out how much data we can add */ + if (iter) { + struct smb_extract_to_rdma extract = { + .nr_sge = 1, + .max_sge = SMBDIRECT_MAX_SEND_SGE, + .sge = request->sge, + .device = info->id->device, + .local_dma_lkey = info->pd->local_dma_lkey, + .direction = DMA_TO_DEVICE, + }; + + rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length, + &extract); + if (rc < 0) + goto err_dma; + data_length = rc; + request->num_sge = extract.nr_sge; + *_remaining_data_length -= data_length; + } else { + data_length = 0; + request->num_sge = 1; + } /* Fill in the packet header */ packet = smbd_request_payload(request); @@ -897,7 +926,7 @@ wait_send_queue: else packet->data_offset = cpu_to_le32(24); packet->data_length = cpu_to_le32(data_length); - packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->remaining_data_length = cpu_to_le32(*_remaining_data_length); packet->padding = 0; log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", @@ -913,7 +942,6 @@ wait_send_queue: if (!data_length) header_length = offsetof(struct smbd_data_transfer, padding); - request->num_sge = 1; request->sge[0].addr = ib_dma_map_single(info->id->device, (void *)packet, header_length, @@ -927,23 +955,6 @@ wait_send_queue: request->sge[0].length = header_length; request->sge[0].lkey = info->pd->local_dma_lkey; - /* Fill in the packet data payload */ - num_sgs = sgl ? 
sg_nents(sgl) : 0; - for_each_sg(sgl, sg, num_sgs, i) { - request->sge[i+1].addr = - ib_dma_map_page(info->id->device, sg_page(sg), - sg->offset, sg->length, DMA_TO_DEVICE); - if (ib_dma_mapping_error( - info->id->device, request->sge[i+1].addr)) { - rc = -EIO; - request->sge[i+1].addr = 0; - goto err_dma; - } - request->sge[i+1].length = sg->length; - request->sge[i+1].lkey = info->pd->local_dma_lkey; - request->num_sge++; - } - rc = smbd_post_send(info, request); if (!rc) return 0; @@ -976,61 +987,16 @@ err_wait_credit: } /* - * Send a page - * page: the page to send - * offset: offset in the page to send - * size: length in the page to send - * remaining_data_length: remaining data to send in this payload - */ -static int smbd_post_send_page(struct smbd_connection *info, struct page *page, - unsigned long offset, size_t size, int remaining_data_length) -{ - struct scatterlist sgl; - - sg_init_table(&sgl, 1); - sg_set_page(&sgl, page, size, offset); - - return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); -} - -/* * Send an empty message * Empty message is used to extend credits to peer to for keep live * while there is no upper layer payload to send at the time */ static int smbd_post_send_empty(struct smbd_connection *info) { - info->count_send_empty++; - return smbd_post_send_sgl(info, NULL, 0, 0); -} - -/* - * Send a data buffer - * iov: the iov array describing the data buffers - * n_vec: number of iov array - * remaining_data_length: remaining data to send following this packet - * in segmented SMBD packet - */ -static int smbd_post_send_data( - struct smbd_connection *info, struct kvec *iov, int n_vec, - int remaining_data_length) -{ - int i; - u32 data_length = 0; - struct scatterlist sgl[SMBDIRECT_MAX_SEND_SGE - 1]; - - if (n_vec > SMBDIRECT_MAX_SEND_SGE - 1) { - cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); - return -EINVAL; - } + int remaining_data_length = 0; - sg_init_table(sgl, n_vec); - for (i = 0; i < n_vec; i++) { - data_length += iov[i].iov_len; - sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); - } - - return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); + info->count_send_empty++; + return smbd_post_send_iter(info, NULL, &remaining_data_length); } /* @@ -1700,6 +1666,7 @@ static struct smbd_connection *_smbd_get_connection( allocate_mr_failed: /* At this point, need to a full transport shutdown */ + server->smbd_conn = info; smbd_destroy(server); return NULL; @@ -1985,18 +1952,10 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct kvec vecs[SMBDIRECT_MAX_SEND_SGE - 1]; - int nvecs; - int size; - unsigned int buflen, remaining_data_length; - unsigned int offset, remaining_vec_data_length; - int start, i, j; - int max_iov_size = - info->max_send_size - sizeof(struct smbd_data_transfer); - struct kvec *iov; - int rc; struct smb_rqst *rqst; - int rqst_idx; + struct iov_iter iter; + unsigned int remaining_data_length, klen; + int rc, i, rqst_idx; if (info->transport_status != SMBD_CONNECTED) return -EAGAIN; @@ -2023,84 +1982,36 @@ int smbd_send(struct TCP_Server_Info *server, rqst_idx = 0; do { rqst = &rqst_array[rqst_idx]; - iov = rqst->rq_iov; cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n", - rqst_idx, smb_rqst_len(server, rqst)); - remaining_vec_data_length = 0; - for (i = 0; i < rqst->rq_nvec; i++) { - remaining_vec_data_length += iov[i].iov_len; - dump_smb(iov[i].iov_base, iov[i].iov_len); - } - - 
log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d rq_tailsz=%d buflen=%lu\n", - rqst_idx, rqst->rq_nvec, - rqst->rq_npages, rqst->rq_pagesz, - rqst->rq_tailsz, smb_rqst_len(server, rqst)); - - start = 0; - offset = 0; - do { - buflen = 0; - i = start; - j = 0; - while (i < rqst->rq_nvec && - j < SMBDIRECT_MAX_SEND_SGE - 1 && - buflen < max_iov_size) { - - vecs[j].iov_base = iov[i].iov_base + offset; - if (buflen + iov[i].iov_len > max_iov_size) { - vecs[j].iov_len = - max_iov_size - iov[i].iov_len; - buflen = max_iov_size; - offset = vecs[j].iov_len; - } else { - vecs[j].iov_len = - iov[i].iov_len - offset; - buflen += vecs[j].iov_len; - offset = 0; - ++i; - } - ++j; - } + rqst_idx, smb_rqst_len(server, rqst)); + for (i = 0; i < rqst->rq_nvec; i++) + dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len); + + log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n", + rqst_idx, rqst->rq_nvec, remaining_data_length, + iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst)); + + /* Send the metadata pages. */ + klen = 0; + for (i = 0; i < rqst->rq_nvec; i++) + klen += rqst->rq_iov[i].iov_len; + iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); + + rc = smbd_post_send_iter(info, &iter, &remaining_data_length); + if (rc < 0) + break; - remaining_vec_data_length -= buflen; - remaining_data_length -= buflen; - log_write(INFO, "sending %s iov[%d] from start=%d nvecs=%d remaining_data_length=%d\n", - remaining_vec_data_length > 0 ? - "partial" : "complete", - rqst->rq_nvec, start, j, - remaining_data_length); - - start = i; - rc = smbd_post_send_data(info, vecs, j, remaining_data_length); - if (rc) - goto done; - } while (remaining_vec_data_length > 0); - - /* now sending pages if there are any */ - for (i = 0; i < rqst->rq_npages; i++) { - rqst_page_get_length(rqst, i, &buflen, &offset); - nvecs = (buflen + max_iov_size - 1) / max_iov_size; - log_write(INFO, "sending pages buflen=%d nvecs=%d\n", - buflen, nvecs); - for (j = 0; j < nvecs; j++) { - size = min_t(unsigned int, max_iov_size, remaining_data_length); - remaining_data_length -= size; - log_write(INFO, "sending pages i=%d offset=%d size=%d remaining_data_length=%d\n", - i, j * max_iov_size + offset, size, - remaining_data_length); - rc = smbd_post_send_page( - info, rqst->rq_pages[i], - j*max_iov_size + offset, - size, remaining_data_length); - if (rc) - goto done; - } + if (iov_iter_count(&rqst->rq_iter) > 0) { + /* And then the data pages if there are any */ + rc = smbd_post_send_iter(info, &rqst->rq_iter, + &remaining_data_length); + if (rc < 0) + break; } + } while (++rqst_idx < num_rqst); -done: /* * As an optimization, we don't wait for individual I/O to finish * before sending the next one. 
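The hunks above replace smbd_send()'s hand-rolled kvec splitting and per-page sends with the kernel's iov_iter API: the SMB header kvecs are wrapped with iov_iter_kvec() and handed, together with rq_iter, to smbd_post_send_iter(). As an illustrative aside (not part of the patch), a minimal sketch of that pattern, using a hypothetical demo_post() callback in place of the real transport send, looks like this:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/uio.h>

/*
 * Wrap an existing kvec array in an iov_iter and drain it in bounded chunks.
 * demo_post() is a hypothetical stand-in for a transport send such as
 * smbd_post_send_iter(); it reports how many bytes it consumed.
 */
static int demo_send_kvecs(struct kvec *iov, unsigned long nvec, size_t chunk,
			   ssize_t (*demo_post)(struct iov_iter *iter, size_t len))
{
	struct iov_iter iter;
	size_t total = 0;
	unsigned long i;

	for (i = 0; i < nvec; i++)
		total += iov[i].iov_len;

	/* ITER_SOURCE: data flows out of these buffers towards the transport. */
	iov_iter_kvec(&iter, ITER_SOURCE, iov, nvec, total);

	while (iov_iter_count(&iter) > 0) {
		ssize_t n = demo_post(&iter, min(chunk, iov_iter_count(&iter)));

		if (n <= 0)
			return n ? (int)n : -EIO;
		/* Consume what the transport mapped; no manual offset bookkeeping. */
		iov_iter_advance(&iter, n);
	}
	return 0;
}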
@@ -2191,10 +2102,10 @@ static void destroy_mr_list(struct smbd_connection *info) cancel_work_sync(&info->mr_recovery_work); list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { if (mr->state == MR_INVALIDATED) - ib_dma_unmap_sg(info->id->device, mr->sgl, - mr->sgl_count, mr->dir); + ib_dma_unmap_sg(info->id->device, mr->sgt.sgl, + mr->sgt.nents, mr->dir); ib_dereg_mr(mr->mr); - kfree(mr->sgl); + kfree(mr->sgt.sgl); kfree(mr); } } @@ -2217,6 +2128,7 @@ static int allocate_mr_list(struct smbd_connection *info) atomic_set(&info->mr_ready_count, 0); atomic_set(&info->mr_used_count, 0); init_waitqueue_head(&info->wait_for_mr_cleanup); + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < info->responder_resources * 2; i++) { smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); @@ -2229,11 +2141,10 @@ static int allocate_mr_list(struct smbd_connection *info) info->mr_type, info->max_frmr_depth); goto out; } - smbdirect_mr->sgl = kcalloc( - info->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgl) { + smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!smbdirect_mr->sgt.sgl) { log_rdma_mr(ERR, "failed to allocate sgl\n"); ib_dereg_mr(smbdirect_mr->mr); goto out; @@ -2244,15 +2155,15 @@ static int allocate_mr_list(struct smbd_connection *info) list_add_tail(&smbdirect_mr->list, &info->mr_list); atomic_inc(&info->mr_ready_count); } - INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); return 0; out: kfree(smbdirect_mr); list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + list_del(&smbdirect_mr->list); ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgl); + kfree(smbdirect_mr->sgt.sgl); kfree(smbdirect_mr); } return -ENOMEM; @@ -2305,26 +2216,45 @@ again: } /* + * Transcribe the pages from an iterator into an MR scatterlist. + */ +static int smbd_iter_to_mr(struct smbd_connection *info, + struct iov_iter *iter, + struct sg_table *sgt, + unsigned int max_sg) +{ + int ret; + + memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); + + ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); + WARN_ON(ret < 0); + if (sgt->nents > 0) + sg_mark_end(&sgt->sgl[sgt->nents - 1]); + return ret; +} + +/* * Register memory for RDMA read/write - * pages[]: the list of pages to register memory with - * num_pages: the number of pages to register - * tailsz: if non-zero, the bytes to register in the last page + * iter: the buffer to register memory with * writing: true if this is a RDMA write (SMB read), false for RDMA read * need_invalidate: true if this MR needs to be locally invalidated after I/O * return value: the MR registered, NULL if failed. 
*/ -struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate) +struct smbd_mr *smbd_register_mr(struct smbd_connection *info, + struct iov_iter *iter, + bool writing, bool need_invalidate) { struct smbd_mr *smbdirect_mr; - int rc, i; + int rc, num_pages; enum dma_data_direction dir; struct ib_reg_wr *reg_wr; + num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1); if (num_pages > info->max_frmr_depth) { log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", num_pages, info->max_frmr_depth); + WARN_ON_ONCE(1); return NULL; } @@ -2333,45 +2263,31 @@ struct smbd_mr *smbd_register_mr( log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; } - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgl_count = num_pages; - sg_init_table(smbdirect_mr->sgl, num_pages); - - log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n", - num_pages, offset, tailsz); - - if (num_pages == 1) { - sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset); - goto skip_multiple_pages; - } - /* We have at least two pages to register */ - sg_set_page( - &smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset); - i = 1; - while (i < num_pages - 1) { - sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); - i++; - } - sg_set_page(&smbdirect_mr->sgl[i], pages[i], - tailsz ? tailsz : PAGE_SIZE, 0); - -skip_multiple_pages: dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; smbdirect_mr->dir = dir; - rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); + smbdirect_mr->need_invalidate = need_invalidate; + smbdirect_mr->sgt.nents = 0; + smbdirect_mr->sgt.orig_nents = 0; + + log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", + num_pages, iov_iter_count(iter), info->max_frmr_depth); + smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth); + + rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", num_pages, dir, rc); goto dma_map_error; } - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, - NULL, PAGE_SIZE); - if (rc != num_pages) { + rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); + if (rc != smbdirect_mr->sgt.nents) { log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d num_pages = %x\n", - rc, num_pages); + "ib_map_mr_sg failed rc = %d nents = %x\n", + rc, smbdirect_mr->sgt.nents); goto map_mr_error; } @@ -2403,8 +2319,8 @@ skip_multiple_pages: /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, smbdirect_mr->dir); + ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); dma_map_error: smbdirect_mr->state = MR_ERROR; @@ -2471,8 +2387,8 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) if (smbdirect_mr->state == MR_INVALIDATED) { ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, + info->id->device, smbdirect_mr->sgt.sgl, + smbdirect_mr->sgt.nents, smbdirect_mr->dir); smbdirect_mr->state = MR_READY; if (atomic_inc_return(&info->mr_ready_count) == 1) @@ -2490,3 +2406,206 @@ done: return rc; } + +static bool smb_set_sge(struct smb_extract_to_rdma *rdma, + struct page *lowest_page, size_t off, size_t len) +{ + struct ib_sge *sge = &rdma->sge[rdma->nr_sge]; + u64 addr; + 
+ addr = ib_dma_map_page(rdma->device, lowest_page, + off, len, rdma->direction); + if (ib_dma_mapping_error(rdma->device, addr)) + return false; + + sge->addr = addr; + sge->length = len; + sge->lkey = rdma->local_dma_lkey; + rdma->nr_sge++; + return true; +} + +/* + * Extract page fragments from a BVEC-class iterator and add them to an RDMA + * element list. The pages are not pinned. + */ +static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + if (!smb_set_sge(rdma, bv[i].bv_page, off, len)) + return -EIO; + + ret += len; + maxsize -= len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract fragments from a KVEC-class iterator and add them to an RDMA list. + * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. + * The pages are not pinned. + */ +static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + if (!smb_set_sge(rdma, page, off, seg)) + return -EIO; + + ret += seg; + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && rdma->nr_sge < rdma->max_sge); + + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + start = 0; + } + + return ret; +} + +/* + * Extract folio fragments from an XARRAY-class iterator and add them to an + * RDMA list. The folios are not pinned. + */ +static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, + struct smb_extract_to_rdma *rdma, + ssize_t maxsize) +{ + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t off, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + off = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - off); + + if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { + rcu_read_unlock(); + return -EIO; + } + + maxsize -= len; + ret += len; + if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) + break; + } + + rcu_read_unlock(); + return ret; +} + +/* + * Extract page fragments from up to the given amount of the source iterator + * and build up an RDMA list that refers to all of those bits. 
The RDMA list + * is appended to, up to the maximum number of elements set in the parameter + * block. + * + * The extracted page fragments are not pinned or ref'd in any way; if an + * IOVEC/UBUF-type iterator is to be used, it should be converted to a + * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some + * way. + */ +static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, + struct smb_extract_to_rdma *rdma) +{ + ssize_t ret; + int before = rdma->nr_sge; + + switch (iov_iter_type(iter)) { + case ITER_BVEC: + ret = smb_extract_bvec_to_rdma(iter, rdma, len); + break; + case ITER_KVEC: + ret = smb_extract_kvec_to_rdma(iter, rdma, len); + break; + case ITER_XARRAY: + ret = smb_extract_xarray_to_rdma(iter, rdma, len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + if (ret > 0) { + iov_iter_advance(iter, ret); + } else if (ret < 0) { + while (rdma->nr_sge > before) { + struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; + + ib_dma_unmap_single(rdma->device, sge->addr, sge->length, + rdma->direction); + sge->addr = 0; + } + } + + return ret; +} diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h index 207ef979cd51..83f239f376f0 100644 --- a/fs/cifs/smbdirect.h +++ b/fs/cifs/smbdirect.h @@ -288,8 +288,7 @@ struct smbd_mr { struct list_head list; enum mr_state state; struct ib_mr *mr; - struct scatterlist *sgl; - int sgl_count; + struct sg_table sgt; enum dma_data_direction dir; union { struct ib_reg_wr wr; @@ -302,8 +301,8 @@ struct smbd_mr { /* Interfaces to register and deregister MR for RDMA read/write */ struct smbd_mr *smbd_register_mr( - struct smbd_connection *info, struct page *pages[], int num_pages, - int offset, int tailsz, bool writing, bool need_invalidate); + struct smbd_connection *info, struct iov_iter *iter, + bool writing, bool need_invalidate); int smbd_deregister_mr(struct smbd_mr *mr); #else diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3851d0aaa288..b42050c68e6c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -270,26 +270,7 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst) for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; - /* - * Add in the page array if there is one. The caller needs to make - * sure rq_offset and rq_tailsz are set correctly. If a buffer of - * multiple pages ends at page boundary, rq_tailsz needs to be set to - * PAGE_SIZE. 
- */ - if (rqst->rq_npages) { - if (rqst->rq_npages == 1) - buflen += rqst->rq_tailsz; - else { - /* - * If there is more than one page, calculate the - * buffer length based on rq_offset and rq_tailsz - */ - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - - rqst->rq_offset; - buflen += rqst->rq_tailsz; - } - } - + buflen += iov_iter_count(&rqst->rq_iter); return buflen; } @@ -376,23 +357,15 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, total_len += sent; - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst[j].rq_npages; i++) { - struct bio_vec bvec; - - bvec.bv_page = rqst[j].rq_pages[i]; - rqst_page_get_length(&rqst[j], i, &bvec.bv_len, - &bvec.bv_offset); - - iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE, - &bvec, 1, bvec.bv_len); + if (iov_iter_count(&rqst[j].rq_iter) > 0) { + smb_msg.msg_iter = rqst[j].rq_iter; rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) break; - total_len += sent; } - } + +} unmask: sigprocmask(SIG_SETMASK, &oldmask, NULL); @@ -1034,15 +1007,40 @@ cifs_cancelled_callback(struct mid_q_entry *mid) struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) { uint index = 0; + unsigned int min_in_flight = UINT_MAX, max_in_flight = 0; + struct TCP_Server_Info *server = NULL; + int i; if (!ses) return NULL; - /* round robin */ - index = (uint)atomic_inc_return(&ses->chan_seq); - spin_lock(&ses->chan_lock); - index %= ses->chan_count; + for (i = 0; i < ses->chan_count; i++) { + server = ses->chans[i].server; + if (!server) + continue; + + /* + * strictly speaking, we should pick up req_lock to read + * server->in_flight. But it shouldn't matter much here if we + * race while reading this data. The worst that can happen is + * that we could use a channel that's not least loaded. Avoiding + * taking the lock could help reduce wait time, which is + * important for this function + */ + if (server->in_flight < min_in_flight) { + min_in_flight = server->in_flight; + index = i; + } + if (server->in_flight > max_in_flight) + max_in_flight = server->in_flight; + } + + /* if all channels are equally loaded, fall back to round-robin */ + if (min_in_flight == max_in_flight) { + index = (uint)atomic_inc_return(&ses->chan_seq); + index %= ses->chan_count; + } spin_unlock(&ses->chan_lock); return ses->chans[index].server; @@ -1640,11 +1638,11 @@ int cifs_discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = server->pdu_size; - int remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - + size_t remaining = rfclen + HEADER_PREAMBLE_SIZE(server) - server->total_read; while (remaining > 0) { - int length; + ssize_t length; length = cifs_discard_from_socket(server, min_t(size_t, remaining, @@ -1790,10 +1788,15 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); } - length = rdata->read_into_pages(server, rdata, data_len); - if (length < 0) - return length; - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) + length = data_len; /* An RDMA read is already done. 
*/ + else +#endif + length = cifs_read_iter_from_socket(server, &rdata->iter, + data_len); + if (length > 0) + rdata->got_bytes += length; server->total_read += length; cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 61cd6c2628fa..a9b14f81d655 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -54,6 +54,7 @@ #include <net/ipv6.h> #include <trace/events/dlm.h> +#include <trace/events/sock.h> #include "dlm_internal.h" #include "lowcomms.h" @@ -502,6 +503,8 @@ static void lowcomms_data_ready(struct sock *sk) { struct connection *con = sock2con(sk); + trace_sk_data_ready(sk); + set_bit(CF_RECV_INTR, &con->flags); lowcomms_queue_rwork(con); } @@ -533,6 +536,8 @@ static void lowcomms_state_change(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { + trace_sk_data_ready(sk); + queue_work(io_workqueue, &listen_con.rwork); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bd3f3c755b24..c16f0d660cb7 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -260,22 +260,6 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, return i; } -struct extent_crypt_result { - struct completion completion; - int rc; -}; - -static void extent_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct extent_crypt_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->rc = rc; - complete(&ecr->completion); -} - /** * crypt_scatterlist * @crypt_stat: Pointer to the crypt_stat struct to initialize. @@ -293,7 +277,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *iv, int op) { struct skcipher_request *req = NULL; - struct extent_crypt_result ecr; + DECLARE_CRYPTO_WAIT(ecr); int rc = 0; if (unlikely(ecryptfs_verbosity > 0)) { @@ -303,8 +287,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, crypt_stat->key_size); } - init_completion(&ecr.completion); - mutex_lock(&crypt_stat->cs_tfm_mutex); req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS); if (!req) { @@ -315,7 +297,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - extent_crypt_complete, &ecr); + crypto_req_done, &ecr); /* Consider doing this once, when the file is opened */ if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) { rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key, @@ -334,13 +316,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv); rc = op == ENCRYPT ? 
crypto_skcipher_encrypt(req) : crypto_skcipher_decrypt(req); - if (rc == -EINPROGRESS || rc == -EBUSY) { - struct extent_crypt_result *ecr = req->base.data; - - wait_for_completion(&ecr->completion); - rc = ecr->rc; - reinit_completion(&ecr->completion); - } + rc = crypto_wait_req(rc, &ecr); out: skcipher_request_free(req); return rc; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e782b4f1d104..2748a82de42a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -37,10 +37,10 @@ #include "aops.h" -void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int len) +void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + unsigned int from, unsigned int len) { - struct buffer_head *head = page_buffers(page); + struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; unsigned int to = from + len; @@ -127,7 +127,6 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w { struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); if (PageChecked(page)) { ClearPageChecked(page); @@ -135,7 +134,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w create_empty_buffers(page, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); + gfs2_trans_add_databufs(ip, page_folio(page), 0, PAGE_SIZE); } return gfs2_write_jdata_page(page, wbc); } diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index ff9877a68780..09db1914425e 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -9,7 +9,7 @@ #include "incore.h" extern void adjust_fs_space(struct inode *inode); -extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int len); +extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + unsigned int from, unsigned int len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e7537fd305dd..eedf6926c652 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -956,26 +956,40 @@ hole_found: goto out; } -static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len) +static struct folio * +gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) { + struct inode *inode = iter->inode; unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocks; + struct folio *folio; + int status; blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; - return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + if (status) + return ERR_PTR(status); + + folio = iomap_get_folio(iter, pos); + if (IS_ERR(folio)) + gfs2_trans_end(sdp); + return folio; } -static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page) +static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, + unsigned copied, struct folio *folio) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (page && !gfs2_is_stuffed(ip)) - gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); + if (!gfs2_is_stuffed(ip)) + gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos), + copied); + + folio_unlock(folio); + folio_put(folio); if (tr->tr_num_buf_new) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 
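The gfs2 conversion in this diff replaces the old iomap ->page_prepare/->page_done hooks with the new ->get_folio/->put_folio pair, moving transaction setup into folio acquisition and making the filesystem responsible for unlocking and releasing the folio. As an illustrative aside (not part of the patch), the minimal shape of such a pair, with hypothetical demo_begin()/demo_end() helpers standing in for gfs2_trans_begin()/gfs2_trans_end(), might look like this:

#include <linux/err.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical per-write bracketing, standing in for gfs2_trans_begin()/_end(). */
static int demo_begin(struct inode *inode, loff_t pos, unsigned int len) { return 0; }
static void demo_end(struct inode *inode) { }

static struct folio *demo_iomap_get_folio(struct iomap_iter *iter,
					   loff_t pos, unsigned len)
{
	struct folio *folio;
	int err;

	err = demo_begin(iter->inode, pos, len);
	if (err)
		return ERR_PTR(err);

	folio = iomap_get_folio(iter, pos);	/* locked folio or ERR_PTR() */
	if (IS_ERR(folio))
		demo_end(iter->inode);		/* undo the setup on failure */
	return folio;
}

static void demo_iomap_put_folio(struct inode *inode, loff_t pos,
				 unsigned copied, struct folio *folio)
{
	/* With ->put_folio set, the filesystem owns the unlock and the put. */
	folio_unlock(folio);
	folio_put(folio);
	demo_end(inode);
}

static const struct iomap_folio_ops demo_iomap_folio_ops = {
	.get_folio = demo_iomap_get_folio,
	.put_folio = demo_iomap_put_folio,
};

The ops table would then be hung off iomap->folio_ops in ->iomap_begin, as the gfs2_iomap_begin_write() hunk that follows does.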
@@ -983,9 +997,9 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, gfs2_trans_end(sdp); } -static const struct iomap_page_ops gfs2_iomap_page_ops = { - .page_prepare = gfs2_iomap_page_prepare, - .page_done = gfs2_iomap_page_done, +static const struct iomap_folio_ops gfs2_iomap_folio_ops = { + .get_folio = gfs2_iomap_get_folio, + .put_folio = gfs2_iomap_put_folio, }; static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, @@ -1061,7 +1075,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) - iomap->page_ops = &gfs2_iomap_page_ops; + iomap->folio_ops = &gfs2_iomap_folio_ops; return 0; out_trans_end: @@ -1277,7 +1291,7 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, /* * NOTE: Never call gfs2_block_zero_range with an open transaction because it * uses iomap write to perform its actions, which begin their own transactions - * (iomap_begin, page_prepare, etc.) + * (iomap_begin, get_folio, etc.) */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 2e215e8c3c88..6fe9ca253b70 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -83,26 +83,8 @@ static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) return 0; } -static int gfs2_dentry_delete(const struct dentry *dentry) -{ - struct gfs2_inode *ginode; - - if (d_really_is_negative(dentry)) - return 0; - - ginode = GFS2_I(d_inode(dentry)); - if (!gfs2_holder_initialized(&ginode->i_iopen_gh)) - return 0; - - if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) - return 1; - - return 0; -} - const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, - .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 524f3c96b9a4..5adc7d85dbf3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -67,7 +67,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; -struct workqueue_struct *gfs2_delete_workqueue; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); @@ -274,9 +273,8 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) struct address_space *mapping = gfs2_glock2aspace(gl); lockref_mark_dead(&gl->gl_lockref); - - gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); + gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); @@ -883,6 +881,7 @@ void glock_set_object(struct gfs2_glock *gl, void *object) /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock + * @object: object the glock currently points at */ void glock_clear_object(struct gfs2_glock *gl, void *object) { @@ -892,8 +891,7 @@ void glock_clear_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, - prev_object == object || prev_object == NULL)) { + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); @@ -977,6 +975,26 @@ static bool gfs2_try_evict(struct gfs2_glock *gl) return evicted; } +bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if 
(test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 0); +} + +static bool gfs2_queue_verify_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 5 * HZ); +} + static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); @@ -985,11 +1003,7 @@ static void delete_work_func(struct work_struct *work) struct inode *inode; u64 no_addr = gl->gl_name.ln_number; - spin_lock(&gl->gl_lockref.lock); - clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); - spin_unlock(&gl->gl_lockref.lock); - - if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { + if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) { /* * If we can evict the inode, give the remote node trying to * delete the inode some time before verifying that the delete @@ -1008,22 +1022,28 @@ static void delete_work_func(struct work_struct *work) * step entirely. */ if (gfs2_try_evict(gl)) { - if (gfs2_queue_delete_work(gl, 5 * HZ)) + if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + goto out; + if (gfs2_queue_verify_evict(gl)) return; } goto out; } - inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, - GFS2_BLKST_UNLINKED); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -EAGAIN && - (gfs2_queue_delete_work(gl, 5 * HZ))) + if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) { + inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, + GFS2_BLKST_UNLINKED); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) && + gfs2_queue_verify_evict(gl)) return; - } else { - d_prune_aliases(inode); - iput(inode); + } else { + d_prune_aliases(inode); + iput(inode); + } } + out: gfs2_glock_put(gl); } @@ -1985,26 +2005,26 @@ add_back_to_lru: static long gfs2_scan_glock_lru(int nr) { - struct gfs2_glock *gl; - LIST_HEAD(skipped); + struct gfs2_glock *gl, *next; LIST_HEAD(dispose); long freed = 0; spin_lock(&lru_lock); - while ((nr-- >= 0) && !list_empty(&lru_list)) { - gl = list_first_entry(&lru_list, struct gfs2_glock, gl_lru); - + list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { + if (nr-- <= 0) + break; /* Test for being demotable */ if (!test_bit(GLF_LOCK, &gl->gl_flags)) { - list_move(&gl->gl_lru, &dispose); - atomic_dec(&lru_count); - freed++; - continue; + if (!spin_trylock(&gl->gl_lockref.lock)) + continue; + if (!gl->gl_lockref.count) { + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + freed++; + } + spin_unlock(&gl->gl_lockref.lock); } - - list_move(&gl->gl_lru, &skipped); } - list_splice(&skipped, &lru_list); if (!list_empty(&dispose)) gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); @@ -2063,37 +2083,21 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) rhashtable_walk_exit(&iter); } -bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay) -{ - bool queued; - - spin_lock(&gl->gl_lockref.lock); - queued = queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, delay); - if (queued) - set_bit(GLF_PENDING_DELETE, &gl->gl_flags); - spin_unlock(&gl->gl_lockref.lock); - return queued; -} - void gfs2_cancel_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work(&gl->gl_delete)) { - clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); + clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); + clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags); + 
if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); - } -} - -bool gfs2_delete_work_queued(const struct gfs2_glock *gl) -{ - return test_bit(GLF_PENDING_DELETE, &gl->gl_flags); } static void flush_delete_work(struct gfs2_glock *gl) { if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + if (cancel_delayed_work(&gl->gl_delete)) { - queue_delayed_work(gfs2_delete_workqueue, + queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } } @@ -2102,7 +2106,7 @@ static void flush_delete_work(struct gfs2_glock *gl) void gfs2_flush_delete_work(struct gfs2_sbd *sdp) { glock_hash_walk(flush_delete_work, sdp); - flush_workqueue(gfs2_delete_workqueue); + flush_workqueue(sdp->sd_delete_wq); } /** @@ -2308,14 +2312,16 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'o'; if (test_bit(GLF_BLOCKING, gflags)) *p++ = 'b'; - if (test_bit(GLF_PENDING_DELETE, gflags)) - *p++ = 'P'; if (test_bit(GLF_FREEING, gflags)) *p++ = 'x'; if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) *p++ = 'n'; if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags)) *p++ = 'N'; + if (test_bit(GLF_TRY_TO_EVICT, gflags)) + *p++ = 'e'; + if (test_bit(GLF_VERIFY_EVICT, gflags)) + *p++ = 'E'; *p = 0; return buf; } @@ -2465,18 +2471,9 @@ int __init gfs2_glock_init(void) rhashtable_destroy(&gl_hash_table); return -ENOMEM; } - gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", - WQ_MEM_RECLAIM | WQ_FREEZABLE, - 0); - if (!gfs2_delete_workqueue) { - destroy_workqueue(glock_workqueue); - rhashtable_destroy(&gl_hash_table); - return -ENOMEM; - } ret = register_shrinker(&glock_shrinker, "gfs2-glock"); if (ret) { - destroy_workqueue(gfs2_delete_workqueue); destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); return ret; @@ -2493,7 +2490,6 @@ void gfs2_glock_exit(void) unregister_shrinker(&glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); - destroy_workqueue(gfs2_delete_workqueue); } static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index f37ac087e2c1..1f1ba92c15a8 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -144,7 +144,6 @@ struct gfs2_glock_aspace { struct address_space mapping; }; -extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; @@ -268,9 +267,8 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -extern bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay); +extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); -extern bool gfs2_delete_work_queued(const struct gfs2_glock *gl); extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d78b61ecc1cd..ad14818a790a 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -193,7 +193,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); int error; - if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + if (!rgd || !test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return 0; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); @@ -222,9 +222,12 @@ static void 
rgrp_go_inval(struct gfs2_glock *gl, int flags) struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); const unsigned bsize = sdp->sd_sb.sb_bsize; - loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; - loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + loff_t start, end; + if (!rgd) + return; + start = (rgd->rd_addr * bsize) & PAGE_MASK; + end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); truncate_inode_pages_range(mapping, start, end); @@ -645,23 +648,18 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) struct gfs2_inode *ip = gl->gl_object; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (!remote || sb_rdonly(sdp->sd_vfs)) + if (!remote || sb_rdonly(sdp->sd_vfs) || + test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && gl->gl_state == LM_ST_SHARED && ip) { gl->gl_lockref.count++; - if (!queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, 0)) + if (!gfs2_queue_try_to_evict(gl)) gl->gl_lockref.count--; } } -static int iopen_go_demote_ok(const struct gfs2_glock *gl) -{ - return !gfs2_delete_work_queued(gl); -} - /** * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it * @gl: glock being freed @@ -767,7 +765,6 @@ const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, .go_dump = inode_go_dump, - .go_demote_ok = iopen_go_demote_ok, .go_flags = GLOF_LRU | GLOF_NONDISK, .go_subclass = 1, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c26765080f28..79485329118b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -329,8 +329,9 @@ enum { GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, - GLF_PENDING_DELETE = 17, - GLF_FREEING = 18, /* Wait for glock to be freed */ + GLF_FREEING = 16, /* Wait for glock to be freed */ + GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ + GLF_VERIFY_EVICT = 18, /* iopen glocks only */ }; struct gfs2_glock { @@ -605,6 +606,8 @@ enum { SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are withdrawing */ + SDF_DEACTIVATING = 15, + SDF_EVICTING = 16, }; enum gfs2_freeze_state { @@ -771,6 +774,10 @@ struct gfs2_sbd { struct completion sd_journal_ready; + /* Workqueue stuff */ + + struct workqueue_struct *sd_delete_wq; + /* Daemon stuff */ struct task_struct *sd_logd_process; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 713efa3bb732..1291b5ee3584 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -225,6 +225,10 @@ fail: gfs2_glock_dq_uninit(&ip->i_iopen_gh); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } iget_failed(inode); return ERR_PTR(error); } @@ -816,6 +820,10 @@ fail_gunlock3: fail_gunlock2: gfs2_glock_put(io_gl); fail_free_inode: + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c0cf1d2d0ef5..6de901c3b89b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1197,9 +1197,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); + sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, 
sdp->sd_fsname); + error = -ENOMEM; + if (!sdp->sd_delete_wq) + goto fail_free; + error = gfs2_sys_fs_add(sdp); if (error) - goto fail_free; + goto fail_delete_wq; gfs2_create_debugfs_file(sdp); @@ -1309,6 +1315,8 @@ fail_lm: fail_debug: gfs2_delete_debugfs_file(sdp); gfs2_sys_fs_del(sdp); +fail_delete_wq: + destroy_workqueue(sdp->sd_delete_wq); fail_free: free_sbd(sdp); sb->s_fs_info = NULL; @@ -1720,6 +1728,55 @@ static int gfs2_meta_init_fs_context(struct fs_context *fc) return 0; } +/** + * gfs2_evict_inodes - evict inodes cooperatively + * @sb: the superblock + * + * When evicting an inode with a zero link count, we are trying to upgrade the + * inode's iopen glock from SH to EX mode in order to determine if we can + * delete the inode. The other nodes are supposed to evict the inode from + * their caches if they can, and to poke the inode's inode glock if they cannot + * do so. Either behavior allows gfs2_upgrade_iopen_glock() to proceed + * quickly, but if the other nodes are not cooperating, the lock upgrading + * attempt will time out. Since inodes are evicted sequentially, this can add + * up quickly. + * + * Function evict_inodes() tries to keep the s_inode_list_lock list locked over + * a long time, which prevents other inodes from being evicted concurrently. + * This precludes the cooperative behavior we are looking for. This special + * version of evict_inodes() avoids that. + * + * Modeled after drop_pagecache_sb(). + */ +static void gfs2_evict_inodes(struct super_block *sb) +{ + struct inode *inode, *toput_inode = NULL; + struct gfs2_sbd *sdp = sb->s_fs_info; + + set_bit(SDF_EVICTING, &sdp->sd_flags); + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && + !need_resched()) { + spin_unlock(&inode->i_lock); + continue; + } + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + iput(toput_inode); + toput_inode = inode; + + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(toput_inode); +} + static void gfs2_kill_sb(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1735,6 +1792,18 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); + + gfs2_evict_inodes(sb); + + /* + * Flush and then drain the delete workqueue here (via + * destroy_workqueue()) to ensure that any delete work that + * may be running will also see the SDF_DEACTIVATING flag. 
+ */ + set_bit(SDF_DEACTIVATING, &sdp->sd_flags); + gfs2_flush_delete_work(sdp); + destroy_workqueue(sdp->sd_delete_wq); + kill_block_super(sb); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f602fb844951..3b9b76e980ad 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1879,7 +1879,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip */ ip = gl->gl_object; - if (ip || !gfs2_queue_delete_work(gl, 0)) + if (ip || !gfs2_queue_try_to_evict(gl)) gfs2_glock_put(gl); else found++; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 999cc146d708..a83fa62106f0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -138,8 +138,10 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error || gfs2_withdrawn(sdp)) + if (error) { + gfs2_consist(sdp); return error; + } if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { gfs2_consist(sdp); @@ -151,7 +153,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (!error && !gfs2_withdrawn(sdp)) + if (!error && gfs2_withdrawn(sdp)) + error = -EIO; + if (!error) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); return error; } @@ -529,7 +533,9 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_flush_delete_work(sdp); + if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + gfs2_flush_delete_work(sdp); + if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); else if (sdp->sd_quotad_process) @@ -933,6 +939,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); if (inode->i_nlink && gfs2_holder_initialized(&ip->i_iopen_gh)) { @@ -952,11 +959,17 @@ static int gfs2_drop_inode(struct inode *inode) struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; gfs2_glock_hold(gl); - if (!gfs2_queue_delete_work(gl, 0)) + if (!gfs2_queue_try_to_evict(gl)) gfs2_glock_queue_put(gl); return 0; } + /* + * No longer cache inodes when trying to evict them all. + */ + if (test_bit(SDF_EVICTING, &sdp->sd_flags)) + return 1; + return generic_drop_inode(inode); } @@ -1175,15 +1188,23 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) gfs2_glock_dq_wait(gh); /* - * If there are no other lock holders, we'll get the lock immediately. + * If there are no other lock holders, we will immediately get + * exclusive access to the iopen glock here. + * * Otherwise, the other nodes holding the lock will be notified about - * our locking request. If they don't have the inode open, they'll - * evict the cached inode and release the lock. Otherwise, if they - * poke the inode glock, we'll take this as an indication that they - * still need the iopen glock and that they'll take care of deleting - * the inode when they're done. As a last resort, if another node - * keeps holding the iopen glock without showing any activity on the - * inode glock, we'll eventually time out. + * our locking request. If they do not have the inode open, they are + * expected to evict the cached inode and release the lock, allowing us + * to proceed. + * + * Otherwise, if they cannot evict the inode, they are expected to poke + * the inode glock (note: not the iopen glock). We will notice that + * and stop waiting for the iopen glock immediately. 
The other node(s) + * are then expected to take care of deleting the inode when they no + * longer use it. + * + * As a last resort, if another node keeps holding the iopen glock + * without showing any activity on the inode glock, we will eventually + * time out and fail the iopen glock upgrade. * * Note that we're passing the LM_FLAG_TRY_1CB flag to the first * locking request as an optimization to notify lock holders as soon as @@ -1401,10 +1422,8 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (gfs2_holder_initialized(&gh)) { - glock_clear_object(ip->i_gl, ip); + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); - } if (ret && ret != GLR_TRYFAILED && ret != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d87ea98cf535..c40118ea4bbc 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -87,6 +87,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) "Withdraw In Prog: %d\n" "Remote Withdraw: %d\n" "Withdraw Recovery: %d\n" + "Deactivating: %d\n" "sd_log_error: %d\n" "sd_log_flush_lock: %d\n" "sd_log_num_revoke: %u\n" @@ -115,6 +116,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) test_bit(SDF_WITHDRAW_IN_PROG, &f), test_bit(SDF_REMOTE_WITHDRAW, &f), test_bit(SDF_WITHDRAW_RECOVERY, &f), + test_bit(SDF_DEACTIVATING, &f), sdp->sd_log_error, rwsem_is_locked(&sdp->sd_log_flush_lock), sdp->sd_log_num_revoke, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 356193e44cf0..d3c300563eb8 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -457,6 +457,33 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) } EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); +/** + * iomap_get_folio - get a folio reference for writing + * @iter: iteration structure + * @pos: start offset of write + * + * Returns a locked reference to the folio at @pos, or an error pointer if the + * folio could not be obtained. 
+ */ +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +{ + unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + struct folio *folio; + + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + + folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + fgp, mapping_gfp_mask(iter->inode->i_mapping)); + if (folio) + return folio; + + if (iter->flags & IOMAP_NOWAIT) + return ERR_PTR(-EAGAIN); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL_GPL(iomap_get_folio); + bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) { trace_iomap_release_folio(folio->mapping->host, folio_pos(folio), @@ -575,6 +602,30 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; } +static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, + size_t len) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->get_folio) + return folio_ops->get_folio(iter, pos, len); + else + return iomap_get_folio(iter, pos); +} + +static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, + struct folio *folio) +{ + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + + if (folio_ops && folio_ops->put_folio) { + folio_ops->put_folio(iter->inode, pos, ret, folio); + } else { + folio_unlock(folio); + folio_put(folio); + } +} + static int iomap_write_begin_inline(const struct iomap_iter *iter, struct folio *folio) { @@ -587,15 +638,11 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, size_t len, struct folio **foliop) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); struct folio *folio; - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; - if (iter->flags & IOMAP_NOWAIT) - fgp |= FGP_NOWAIT; - BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -606,18 +653,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(iter->inode, pos, len); - if (status) - return status; - } - - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, - fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (!folio) { - status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; - goto out_no_page; - } + folio = __iomap_get_folio(iter, pos, len); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * Now we have a locked folio, before we do anything with it we need to @@ -629,9 +667,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, * could do the wrong thing here (zero a page range incorrectly or fail * to zero) and corrupt data. 
*/ - if (page_ops && page_ops->iomap_valid) { - bool iomap_valid = page_ops->iomap_valid(iter->inode, - &iter->iomap); + if (folio_ops && folio_ops->iomap_valid) { + bool iomap_valid = folio_ops->iomap_valid(iter->inode, + &iter->iomap); if (!iomap_valid) { iter->iomap.flags |= IOMAP_F_STALE; status = 0; @@ -656,13 +694,9 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, return 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + __iomap_put_folio(iter, pos, 0, folio); iomap_write_failed(iter->inode, pos, len); -out_no_page: - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -712,7 +746,6 @@ static size_t iomap_write_end_inline(const struct iomap_iter *iter, static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct folio *folio) { - const struct iomap_page_ops *page_ops = iter->iomap.page_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t old_size = iter->inode->i_size; size_t ret; @@ -735,14 +768,10 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, i_size_write(iter->inode, pos + ret); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - folio_unlock(folio); + __iomap_put_folio(iter, pos, ret, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); - if (page_ops && page_ops->page_done) - page_ops->page_done(iter->inode, pos, ret, &folio->page); - folio_put(folio); - if (ret < len) iomap_write_failed(iter->inode, pos + ret, len - ret); return ret; diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig index e1fe17747ed6..7055cb5d2880 100644 --- a/fs/ksmbd/Kconfig +++ b/fs/ksmbd/Kconfig @@ -33,14 +33,16 @@ config SMB_SERVER in ksmbd-tools, available from https://github.com/cifsd-team/ksmbd-tools. More detail about how to run the ksmbd kernel server is - available via README file + available via the README file (https://github.com/cifsd-team/ksmbd-tools/blob/master/README). ksmbd kernel server includes support for auto-negotiation, Secure negotiate, Pre-authentication integrity, oplock/lease, compound requests, multi-credit, packet signing, RDMA(smbdirect), smb3 encryption, copy-offload, secure per-user session - establishment via NTLM or NTLMv2. + establishment via Kerberos or NTLMv2. + +if SMB_SERVER config SMB_SERVER_SMBDIRECT bool "Support for SMB Direct protocol" @@ -54,6 +56,8 @@ config SMB_SERVER_SMBDIRECT SMB Direct allows transferring SMB packets over RDMA. If unsure, say N. 
+endif + config SMB_SERVER_CHECK_CAP_NET_ADMIN bool "Enable check network administration capability" depends on SMB_SERVER diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index c03eba090368..cc6384f79675 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -208,9 +208,9 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, return 0; } -int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, - unsigned char tag, const void *value, - size_t vlen) +static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) { struct ksmbd_conn *conn = context; @@ -223,17 +223,16 @@ int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, return 0; } -int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen, +int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - struct ksmbd_conn *conn = context; - - conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); - if (!conn->mechToken) - return -ENOMEM; + return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen); +} - memcpy(conn->mechToken, value, vlen); - conn->mechToken[vlen] = '\0'; - return 0; +int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) +{ + return ksmbd_neg_token_alloc(context, hdrlen, tag, value, vlen); } diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 56be077e5d8a..5b10b03800c1 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -114,7 +114,7 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) { requests_queue = &conn->requests; - work->syncronous = true; + work->synchronous = true; } if (requests_queue) { @@ -139,7 +139,7 @@ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) spin_lock(&conn->request_lock); if (!work->multiRsp) { list_del_init(&work->request_entry); - if (work->syncronous == false) + if (!work->synchronous) list_del_init(&work->async_request_entry); ret = 0; } @@ -312,7 +312,7 @@ int ksmbd_conn_handler_loop(void *p) max_allowed_pdu_size = SMB3_MAX_MSGSIZE; if (pdu_size > max_allowed_pdu_size) { - pr_err_ratelimited("PDU length(%u) excceed maximum allowed pdu size(%u) on connection(%d)\n", + pr_err_ratelimited("PDU length(%u) exceeded maximum allowed pdu size(%u) on connection(%d)\n", pdu_size, max_allowed_pdu_size, conn->status); break; diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h index 5ece58e40c97..3234f2cf6327 100644 --- a/fs/ksmbd/ksmbd_work.h +++ b/fs/ksmbd/ksmbd_work.h @@ -68,7 +68,7 @@ struct ksmbd_work { /* Request is encrypted */ bool encrypted:1; /* Is this SYNC or ASYNC ksmbd_work */ - bool syncronous:1; + bool synchronous:1; bool need_invalidate_rkey:1; unsigned int remote_key; diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 92b1603b5abe..1ca2aae4c299 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -25,20 +25,19 @@ static DECLARE_RWSEM(sessions_table_lock); struct ksmbd_session_rpc { int id; unsigned int method; - struct list_head list; }; static void free_channel_list(struct ksmbd_session *sess) { - struct channel *chann, *tmp; + struct channel *chann; + unsigned long index; - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - list_del(&chann->chann_list); + xa_for_each(&sess->ksmbd_chann_list, index, chann) { + xa_erase(&sess->ksmbd_chann_list, index); 
kfree(chann); } - write_unlock(&sess->chann_lock); + + xa_destroy(&sess->ksmbd_chann_list); } static void __session_rpc_close(struct ksmbd_session *sess, @@ -58,15 +57,14 @@ static void __session_rpc_close(struct ksmbd_session *sess, static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) { struct ksmbd_session_rpc *entry; + long index; - while (!list_empty(&sess->rpc_handle_list)) { - entry = list_entry(sess->rpc_handle_list.next, - struct ksmbd_session_rpc, - list); - - list_del(&entry->list); + xa_for_each(&sess->rpc_handle_list, index, entry) { + xa_erase(&sess->rpc_handle_list, index); __session_rpc_close(sess, entry); } + + xa_destroy(&sess->rpc_handle_list); } static int __rpc_method(char *rpc_name) @@ -102,13 +100,13 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL); if (!entry) - return -EINVAL; + return -ENOMEM; - list_add(&entry->list, &sess->rpc_handle_list); entry->method = method; entry->id = ksmbd_ipc_id_alloc(); if (entry->id < 0) goto free_entry; + xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL); resp = ksmbd_rpc_open(sess, entry->id); if (!resp) @@ -117,9 +115,9 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) kvfree(resp); return entry->id; free_id: + xa_erase(&sess->rpc_handle_list, entry->id); ksmbd_rpc_id_free(entry->id); free_entry: - list_del(&entry->list); kfree(entry); return -EINVAL; } @@ -128,24 +126,17 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; - list_for_each_entry(entry, &sess->rpc_handle_list, list) { - if (entry->id == id) { - list_del(&entry->list); - __session_rpc_close(sess, entry); - break; - } - } + entry = xa_erase(&sess->rpc_handle_list, id); + if (entry) + __session_rpc_close(sess, entry); } int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; - list_for_each_entry(entry, &sess->rpc_handle_list, list) { - if (entry->id == id) - return entry->method; - } - return 0; + entry = xa_load(&sess->rpc_handle_list, id); + return entry ? 
entry->method : 0; } void ksmbd_session_destroy(struct ksmbd_session *sess) @@ -190,21 +181,15 @@ int ksmbd_session_register(struct ksmbd_conn *conn, static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess) { - struct channel *chann, *tmp; - - write_lock(&sess->chann_lock); - list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, - chann_list) { - if (chann->conn == conn) { - list_del(&chann->chann_list); - kfree(chann); - write_unlock(&sess->chann_lock); - return 0; - } - } - write_unlock(&sess->chann_lock); + struct channel *chann; + + chann = xa_erase(&sess->ksmbd_chann_list, (long)conn); + if (!chann) + return -ENOENT; - return -ENOENT; + kfree(chann); + + return 0; } void ksmbd_sessions_deregister(struct ksmbd_conn *conn) @@ -234,7 +219,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) return; sess_destroy: - if (list_empty(&sess->ksmbd_chann_list)) { + if (xa_empty(&sess->ksmbd_chann_list)) { xa_erase(&conn->sessions, sess->id); ksmbd_session_destroy(sess); } @@ -320,6 +305,9 @@ static struct ksmbd_session *__session_create(int protocol) struct ksmbd_session *sess; int ret; + if (protocol != CIFDS_SESSION_FLAG_SMB2) + return NULL; + sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL); if (!sess) return NULL; @@ -329,30 +317,20 @@ static struct ksmbd_session *__session_create(int protocol) set_session_flag(sess, protocol); xa_init(&sess->tree_conns); - INIT_LIST_HEAD(&sess->ksmbd_chann_list); - INIT_LIST_HEAD(&sess->rpc_handle_list); + xa_init(&sess->ksmbd_chann_list); + xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; - rwlock_init(&sess->chann_lock); - - switch (protocol) { - case CIFDS_SESSION_FLAG_SMB2: - ret = __init_smb2_session(sess); - break; - default: - ret = -EINVAL; - break; - } + ret = __init_smb2_session(sess); if (ret) goto error; ida_init(&sess->tree_conn_ida); - if (protocol == CIFDS_SESSION_FLAG_SMB2) { - down_write(&sessions_table_lock); - hash_add(sessions_table, &sess->hlist, sess->id); - up_write(&sessions_table_lock); - } + down_write(&sessions_table_lock); + hash_add(sessions_table, &sess->hlist, sess->id); + up_write(&sessions_table_lock); + return sess; error: diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 8934b8ee275b..b6a9e7a6aae4 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -21,7 +21,6 @@ struct ksmbd_file_table; struct channel { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; struct ksmbd_conn *conn; - struct list_head chann_list; }; struct preauth_session { @@ -50,11 +49,10 @@ struct ksmbd_session { char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; - rwlock_t chann_lock; - struct list_head ksmbd_chann_list; + struct xarray ksmbd_chann_list; struct xarray tree_conns; struct ida tree_conn_ida; - struct list_head rpc_handle_list; + struct xarray rpc_handle_list; __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 6e25ace36568..fbdde426dd01 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -149,15 +149,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, break; case SMB2_LOCK: { - int lock_count; + unsigned short lock_count; - /* - * smb2_lock request size is 48 included single - * smb2_lock_element structure size. 
- */ - lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount) - 1; + lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount); if (lock_count > 0) { - *off = __SMB2_HEADER_STRUCTURE_SIZE + 48; + *off = offsetof(struct smb2_lock_req, locks); *len = sizeof(struct smb2_lock_element) * lock_count; } break; @@ -412,20 +408,19 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) goto validate_credit; /* - * windows client also pad up to 8 bytes when compounding. - * If pad is longer than eight bytes, log the server behavior - * (once), since may indicate a problem but allow it and - * continue since the frame is parseable. + * SMB2 NEGOTIATE request will be validated when message + * handling proceeds. */ - if (clc_len < len) { - ksmbd_debug(SMB, - "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", - len, clc_len, command, - le64_to_cpu(hdr->MessageId)); + if (command == SMB2_NEGOTIATE_HE) + goto validate_credit; + + /* + * Allow a message that padded to 8byte boundary. + */ + if (clc_len < len && (len - clc_len) < 8) goto validate_credit; - } - ksmbd_debug(SMB, + pr_err_ratelimited( "cli req too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, le64_to_cpu(hdr->MessageId)); diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index e401302478c3..aed7704a0672 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -26,7 +26,7 @@ static struct smb_version_values smb21_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -52,7 +52,7 @@ static struct smb_version_values smb30_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -79,7 +79,7 @@ static struct smb_version_values smb302_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, @@ -106,7 +106,7 @@ static struct smb_version_values smb311_server_values = { .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .max_header_size = MAX_SMB2_HDR_SIZE, - .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .read_rsp_size = sizeof(struct smb2_read_rsp), .lock_cmd = SMB2_LOCK, .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 4ef6e1e59a40..0685c1c77b9f 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -75,14 +75,7 @@ static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn) { - struct channel *chann; - - list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) { - if (chann->conn == conn) - return chann; - } - - return NULL; + return xa_load(&sess->ksmbd_chann_list, (long)conn); } /** @@ -281,8 +274,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, 
sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; @@ -506,7 +498,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) rsp_hdr->SessionId = rcv_hdr->SessionId; memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16); - work->syncronous = true; + work->synchronous = true; if (work->async_id) { ksmbd_release_id(&conn->async_ida, work->async_id); work->async_id = 0; @@ -596,6 +588,7 @@ static void destroy_previous_session(struct ksmbd_conn *conn, struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id); struct ksmbd_user *prev_user; struct channel *chann; + long index; if (!prev_sess) return; @@ -609,10 +602,8 @@ static void destroy_previous_session(struct ksmbd_conn *conn, return; prev_sess->state = SMB2_SESSION_EXPIRED; - write_lock(&prev_sess->chann_lock); - list_for_each_entry(chann, &prev_sess->ksmbd_chann_list, chann_list) + xa_for_each(&prev_sess->ksmbd_chann_list, index, chann) chann->conn->status = KSMBD_SESS_EXITING; - write_unlock(&prev_sess->chann_lock); } /** @@ -653,7 +644,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) pr_err("Failed to alloc async message id\n"); return id; } - work->syncronous = false; + work->synchronous = false; work->async_id = id; rsp_hdr->Id.AsyncId = cpu_to_le64(id); @@ -1213,8 +1204,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + - AUTH_GSS_LENGTH); + sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; @@ -1520,19 +1510,14 @@ static int ntlm_authenticate(struct ksmbd_work *work) binding_session: if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -1607,19 +1592,14 @@ static int krb5_authenticate(struct ksmbd_work *work) } if (conn->dialect >= SMB30_PROT_ID) { - read_lock(&sess->chann_lock); chann = lookup_chann_list(sess, conn); - read_unlock(&sess->chann_lock); if (!chann) { chann = kmalloc(sizeof(struct channel), GFP_KERNEL); if (!chann) return -ENOMEM; chann->conn = conn; - INIT_LIST_HEAD(&chann->chann_list); - write_lock(&sess->chann_lock); - list_add(&chann->chann_list, &sess->ksmbd_chann_list); - write_unlock(&sess->chann_lock); + xa_store(&sess->ksmbd_chann_list, (long)conn, chann, GFP_KERNEL); } } @@ -6645,7 +6625,7 @@ int smb2_cancel(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_hdr *hdr = smb2_get_msg(work->request_buf); struct smb2_hdr *chdr; - struct ksmbd_work *cancel_work = NULL, *iter; + struct ksmbd_work *iter; struct list_head *command_list; ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", @@ -6667,7 +6647,9 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with 
AsyncId %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->Id.AsyncId), le16_to_cpu(chdr->Command)); - cancel_work = iter; + iter->state = KSMBD_WORK_CANCELLED; + if (iter->cancel_fn) + iter->cancel_fn(iter->cancel_argv); break; } spin_unlock(&conn->request_lock); @@ -6686,18 +6668,12 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with mid %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->MessageId), le16_to_cpu(chdr->Command)); - cancel_work = iter; + iter->state = KSMBD_WORK_CANCELLED; break; } spin_unlock(&conn->request_lock); } - if (cancel_work) { - cancel_work->state = KSMBD_WORK_CANCELLED; - if (cancel_work->cancel_fn) - cancel_work->cancel_fn(cancel_work->cancel_argv); - } - /* For SMB2_CANCEL command itself send no response*/ work->send_no_response = 1; return 0; @@ -7062,6 +7038,14 @@ skip: ksmbd_vfs_posix_lock_wait(flock); + spin_lock(&work->conn->request_lock); + spin_lock(&fp->f_lock); + list_del(&work->fp_entry); + work->cancel_fn = NULL; + kfree(argv); + spin_unlock(&fp->f_lock); + spin_unlock(&work->conn->request_lock); + if (work->state != KSMBD_WORK_ACTIVE) { list_del(&smb_lock->llist); spin_lock(&work->conn->llist_lock); @@ -7070,9 +7054,6 @@ skip: locks_free_lock(flock); if (work->state == KSMBD_WORK_CANCELLED) { - spin_lock(&fp->f_lock); - list_del(&work->fp_entry); - spin_unlock(&fp->f_lock); rsp->hdr.Status = STATUS_CANCELLED; kfree(smb_lock); @@ -7094,9 +7075,6 @@ skip: list_del(&smb_lock->clist); spin_unlock(&work->conn->llist_lock); - spin_lock(&fp->f_lock); - list_del(&work->fp_entry); - spin_unlock(&fp->f_lock); goto retry; } else if (!rc) { spin_lock(&work->conn->llist_lock); @@ -8410,14 +8388,11 @@ int smb3_check_sign_req(struct ksmbd_work *work) if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return 0; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if (!signing_key) { @@ -8477,14 +8452,11 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { signing_key = work->sess->smb3signingkey; } else { - read_lock(&work->sess->chann_lock); chann = lookup_chann_list(work->sess, work->conn); if (!chann) { - read_unlock(&work->sess->chann_lock); return; } signing_key = chann->smb3signingkey; - read_unlock(&work->sess->chann_lock); } if (!signing_key) diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index aa1300b7bfc2..5ea9229dad2c 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -952,9 +952,9 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value * @idmap: idmap of the relevant mount * @dentry: dentry to set XATTR at - * @name: xattr name for setxattr - * @value: xattr value to set - * @size: size of xattr value + * @attr_name: xattr name for setxattr + * @attr_value: xattr value to set + * @attr_size: size of xattr value * @flags: destination buffer length * * Return: 0 on success, otherwise error diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 1d8126443a7f..054a7d2e0f48 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -365,12 +365,11 @@ static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp) static void set_close_state_blocked_works(struct ksmbd_file *fp) { - struct ksmbd_work *cancel_work, *ctmp; + struct ksmbd_work *cancel_work; spin_lock(&fp->f_lock); - 
list_for_each_entry_safe(cancel_work, ctmp, &fp->blocked_works, + list_for_each_entry(cancel_work, &fp->blocked_works, fp_entry) { - list_del(&cancel_work->fp_entry); cancel_work->state = KSMBD_WORK_CLOSED; cancel_work->cancel_fn(cancel_work->cancel_argv); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 914ea1c3537d..9a47303b2cba 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -685,17 +685,16 @@ module_exit(exit_nlm); /** * nlmsvc_dispatch - Process an NLM Request * @rqstp: incoming request - * @statp: pointer to location of accept_stat field in RPC Reply buffer * * Return values: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) +static int nlmsvc_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; + __be32 *statp = rqstp->rq_accept_statp; - svcxdr_init_decode(rqstp); if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -705,7 +704,6 @@ static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) if (*statp != rpc_success) return 1; - svcxdr_init_encode(rqstp); if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; @@ -723,7 +721,7 @@ out_encode_err: /* * Define NLM program and procedures */ -static unsigned int nlmsvc_version1_count[17]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, nlmsvc_version1_count[17]); static const struct svc_version nlmsvc_version1 = { .vs_vers = 1, .vs_nproc = 17, @@ -732,26 +730,31 @@ static const struct svc_version nlmsvc_version1 = { .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; -static unsigned int nlmsvc_version3_count[24]; + +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlmsvc_version3_count[ARRAY_SIZE(nlmsvc_procedures)]); static const struct svc_version nlmsvc_version3 = { .vs_vers = 3, - .vs_nproc = 24, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures), .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; + #ifdef CONFIG_LOCKD_V4 -static unsigned int nlmsvc_version4_count[24]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nlmsvc_version4_count[ARRAY_SIZE(nlmsvc_procedures4)]); static const struct svc_version nlmsvc_version4 = { .vs_vers = 4, - .vs_nproc = 24, + .vs_nproc = ARRAY_SIZE(nlmsvc_procedures4), .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif + static const struct svc_version *nlmsvc_version[] = { [1] = &nlmsvc_version1, [3] = &nlmsvc_version3, diff --git a/fs/namei.c b/fs/namei.c index 5855dc6edbd5..edfedfbccaef 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1459,11 +1459,11 @@ EXPORT_SYMBOL(follow_down_one); * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. 
*/ -int follow_down(struct path *path) +int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; - int ret = traverse_mounts(path, &jumped, NULL, 0); + int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); @@ -2865,7 +2865,7 @@ int path_pts(struct path *path) path->dentry = child; dput(parent); - follow_down(path); + follow_down(path, 0); return 0; } #endif diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index f684c0cd1ec5..386d6fb92793 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -3,6 +3,7 @@ netfs-y := \ buffered_read.o \ io.o \ + iterator.o \ main.o \ objects.o diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c new file mode 100644 index 000000000000..f00d43b8ac0a --- /dev/null +++ b/fs/netfs/iterator.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Iterator helpers. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/scatterlist.h> +#include <linux/netfs.h> +#include "internal.h" + +/** + * netfs_extract_user_iter - Extract the pages from a user iterator into a bvec + * @orig: The original iterator + * @orig_len: The amount of iterator to copy + * @new: The iterator to be set up + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * build up a second iterator that refers to all of those bits. This allows + * the original iterator to disposed of. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA be + * allowed on the pages extracted. + * + * On success, the number of elements in the bvec is returned, the original + * iterator will have been advanced by the amount extracted. + * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, + struct iov_iter *new, + iov_iter_extraction_t extraction_flags) +{ + struct bio_vec *bv = NULL; + struct page **pages; + unsigned int cur_npages; + unsigned int max_pages; + unsigned int npages = 0; + unsigned int i; + ssize_t ret; + size_t count = orig_len, offset, len; + size_t bv_size, pg_size; + + if (WARN_ON_ONCE(!iter_is_ubuf(orig) && !iter_is_iovec(orig))) + return -EIO; + + max_pages = iov_iter_npages(orig, INT_MAX); + bv_size = array_size(max_pages, sizeof(*bv)); + bv = kvmalloc(bv_size, GFP_KERNEL); + if (!bv) + return -ENOMEM; + + /* Put the page list at the end of the bvec list storage. bvec + * elements are larger than page pointers, so as long as we work + * 0->last, we should be fine. + */ + pg_size = array_size(max_pages, sizeof(*pages)); + pages = (void *)bv + bv_size - pg_size; + + while (count && npages < max_pages) { + ret = iov_iter_extract_pages(orig, &pages, count, + max_pages - npages, extraction_flags, + &offset); + if (ret < 0) { + pr_err("Couldn't get user pages (rc=%zd)\n", ret); + break; + } + + if (ret > count) { + pr_err("get_pages rc=%zd more than %zu\n", ret, count); + break; + } + + count -= ret; + ret += offset; + cur_npages = DIV_ROUND_UP(ret, PAGE_SIZE); + + if (npages + cur_npages > max_pages) { + pr_err("Out of bvec array capacity (%u vs %u)\n", + npages + cur_npages, max_pages); + break; + } + + for (i = 0; i < cur_npages; i++) { + len = ret > PAGE_SIZE ? 
PAGE_SIZE : ret; + bvec_set_page(bv + npages + i, *pages++, len - offset, offset); + ret -= len; + offset = 0; + } + + npages += cur_npages; + } + + iov_iter_bvec(new, orig->data_source, bv, npages, orig_len - count); + return npages; +} +EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class + * iterators, and add them to the scatterlist. + */ +static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct page **pages; + unsigned int npages; + ssize_t ret = 0, res; + size_t len, off; + + /* We decant the page list into the tail of the scatterlist */ + pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); + pages -= sg_max; + + do { + res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, + extraction_flags, &off); + if (res < 0) + goto failed; + + len = res; + maxsize -= len; + ret += len; + npages = DIV_ROUND_UP(off + len, PAGE_SIZE); + sg_max -= npages; + + for (; npages < 0; npages--) { + struct page *page = *pages; + size_t seg = min_t(size_t, PAGE_SIZE - off, len); + + *pages++ = NULL; + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + len -= seg; + off = 0; + } + } while (maxsize > 0 && sg_max > 0); + + return ret; + +failed: + while (sgtable->nents > sgtable->orig_nents) + put_page(sg_page(&sgtable->sgl[--sgtable->nents])); + return res; +} + +/* + * Extract up to sg_max pages from a BVEC-type iterator and add them to the + * scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct bio_vec *bv = iter->bvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + sg_set_page(sg, bv[i].bv_page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + ret += len; + maxsize -= len; + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max pages from a KVEC-type iterator and add them to the + * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or + * static buffers. The pages are not pinned. 
+ */ +static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct kvec *kv = iter->kvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + ret += len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && sg_max > 0); + + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max folios from an XARRAY-type iterator and add them to + * the scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t offset, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + offset = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - offset); + + sg_set_page(sg, folio_page(folio, 0), len, offset); + sgtable->nents++; + sg++; + sg_max--; + + maxsize -= len; + ret += len; + if (maxsize <= 0 || sg_max == 0) + break; + } + + rcu_read_unlock(); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/** + * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist + * @iter: The iterator to extract from + * @maxsize: The amount of iterator to copy + * @sgtable: The scatterlist table to fill in + * @sg_max: Maximum number of elements in @sgtable that may be filled + * @extraction_flags: Flags to qualify the request + * + * Extract the page fragments from the given amount of the source iterator and + * add them to a scatterlist that refers to all of those bits, to a maximum + * addition of @sg_max elements. + * + * The pages referred to by UBUF- and IOVEC-type iterators are extracted and + * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- + * and DISCARD-type are not supported. + * + * No end mark is placed on the scatterlist; that's left to the caller. + * + * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA + * be allowed on the pages extracted. + * + * If successul, @sgtable->nents is updated to include the number of elements + * added and the number of bytes added is returned. @sgtable->orig_nents is + * left unaltered. 
+ * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, + struct sg_table *sgtable, unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_UBUF: + case ITER_IOVEC: + return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_BVEC: + return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_KVEC: + return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + case ITER_XARRAY: + return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); + default: + pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} +EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d0cccddb7d08..321af81c456e 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -980,14 +980,11 @@ out_invalidcred: } static int -nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp) +nfs_callback_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; - svcxdr_init_decode(rqstp); - svcxdr_init_encode(rqstp); - - *statp = procp->pc_func(rqstp); + *rqstp->rq_accept_statp = procp->pc_func(rqstp); return 1; } @@ -1072,7 +1069,8 @@ static const struct svc_procedure nfs4_callback_procedures1[] = { } }; -static unsigned int nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count1[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version1 = { .vs_vers = 1, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), @@ -1084,7 +1082,8 @@ const struct svc_version nfs4_callback_version1 = { .vs_need_cong_ctrl = true, }; -static unsigned int nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfs4_callback_count4[ARRAY_SIZE(nfs4_callback_procedures1)]); const struct svc_version nfs4_callback_version4 = { .vs_vers = 4, .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f8e420464b77..a41c3ee4549c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -203,14 +203,14 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, { struct nfs_cache_array *array; - array = kmap_atomic(page); + array = kmap_local_page(page); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; array->page_full = 0; array->page_is_eof = 0; array->cookies_are_ordered = 1; - kunmap_atomic(array); + kunmap_local(array); } /* @@ -221,11 +221,11 @@ static void nfs_readdir_clear_array(struct page *page) struct nfs_cache_array *array; unsigned int i; - array = kmap_atomic(page); + array = kmap_local_page(page); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; - kunmap_atomic(array); + kunmap_local(array); } static void nfs_readdir_free_folio(struct folio *folio) @@ -371,14 +371,14 @@ static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, u64 change_attr) { - struct nfs_cache_array *array = kmap_atomic(page); + struct nfs_cache_array *array = kmap_local_page(page); int ret = true; if (array->change_attr != change_attr) ret = false; if 
(nfs_readdir_array_index_cookie(array) != last_cookie) ret = false; - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ -418,9 +418,9 @@ static u64 nfs_readdir_page_last_cookie(struct page *page) struct nfs_cache_array *array; u64 ret; - array = kmap_atomic(page); + array = kmap_local_page(page); ret = array->last_cookie; - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ -429,9 +429,9 @@ static bool nfs_readdir_page_needs_filling(struct page *page) struct nfs_cache_array *array; bool ret; - array = kmap_atomic(page); + array = kmap_local_page(page); ret = !nfs_readdir_array_is_full(array); - kunmap_atomic(array); + kunmap_local(array); return ret; } @@ -439,9 +439,9 @@ static void nfs_readdir_page_set_eof(struct page *page) { struct nfs_cache_array *array; - array = kmap_atomic(page); + array = kmap_local_page(page); nfs_readdir_array_set_eof(array); - kunmap_atomic(array); + kunmap_local(array); } static struct page *nfs_readdir_page_get_next(struct address_space *mapping, @@ -568,14 +568,14 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) struct nfs_cache_array *array; int status; - array = kmap_atomic(desc->page); + array = kmap_local_page(desc->page); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); - kunmap_atomic(array); + kunmap_local(array); return status; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1707f46b1335..9a18c5a69ace 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -343,14 +343,12 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? 
*/ - req = nfs_create_request(dreq->ctx, pagevec[i], - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; } - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; if (!nfs_pageio_add_request(&desc, req)) { result = desc.pg_error; nfs_release_request(req); @@ -802,8 +800,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct nfs_page *req; unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); - req = nfs_create_request(dreq->ctx, pagevec[i], - pgbase, req_len); + req = nfs_page_create_from_page(dreq->ctx, pagevec[i], + pgbase, pos, req_len); if (IS_ERR(req)) { result = PTR_ERR(req); break; @@ -816,8 +814,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_lock_request(req); - req->wb_index = pos >> PAGE_SHIFT; - req->wb_offset = pos & ~PAGE_MASK; if (!nfs_pageio_add_request(&desc, req)) { result = desc.pg_error; nfs_unlock_and_release_request(req); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 1a9d5aa51dfb..d6a6d1ebb8fd 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -42,7 +42,7 @@ nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) dprintk("%s: max fh len %d inode %p parent %p", __func__, *max_len, inode, parent); - if (*max_len < len || IS_AUTOMOUNT(inode)) { + if (*max_len < len) { dprintk("%s: fh len %d too small, required %d\n", __func__, *max_len, len); *max_len = len; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index b0f3c9339e70..893625eacab9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -277,27 +277,28 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. */ -static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len) +static bool nfs_folio_is_full_write(struct folio *folio, loff_t pos, + unsigned int len) { - unsigned int pglen = nfs_page_length(page); - unsigned int offset = pos & (PAGE_SIZE - 1); + unsigned int pglen = nfs_folio_length(folio); + unsigned int offset = offset_in_folio(folio, pos); unsigned int end = offset + len; return !pglen || (end >= pglen && !offset); } -static bool nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned int len) +static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, + loff_t pos, unsigned int len) { /* * Up-to-date pages, those with ongoing or full-page write * don't need read/modify/write */ - if (PageUptodate(page) || PagePrivate(page) || - nfs_full_page_write(page, pos, len)) + if (folio_test_uptodate(folio) || folio_test_private(folio) || + nfs_folio_is_full_write(folio, pos, len)) return false; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) + if (pnfs_ld_read_whole_page(file_inode(file))) return true; /* Open for reading too? */ if (file->f_mode & FMODE_READ) @@ -305,6 +306,15 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page, return false; } +static struct folio * +nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index) +{ + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + + return __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} + /* * This does the "real" work of the write. 
We must allocate and lock the * page to be sent back to the generic routine, which then copies the @@ -314,32 +324,31 @@ static bool nfs_want_read_modify_write(struct file *file, struct page *page, * increment the page use counts until he is done with the page. */ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct page **pagep, + void **fsdata) { - int ret; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int once_thru = 0; + int ret; dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); + if (!folio) return -ENOMEM; - *pagep = page; + *pagep = &folio->page; - ret = nfs_flush_incompatible(file, page); + ret = nfs_flush_incompatible(file, folio); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } else if (!once_thru && - nfs_want_read_modify_write(file, page, pos, len)) { + nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; - ret = nfs_read_folio(file, page_folio(page)); - put_page(page); + ret = nfs_read_folio(file, folio); + folio_put(folio); if (!ret) goto start; } @@ -347,11 +356,12 @@ start: } static int nfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - unsigned offset = pos & (PAGE_SIZE - 1); struct nfs_open_context *ctx = nfs_file_open_context(file); + struct folio *folio = page_folio(page); + unsigned offset = offset_in_folio(folio, pos); int status; dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", @@ -361,26 +371,26 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, * Zero any uninitialised parts of the page, and then mark the page * as up to date if it turns out that we're extending the file. 
*/ - if (!PageUptodate(page)) { - unsigned pglen = nfs_page_length(page); + if (!folio_test_uptodate(folio)) { + size_t fsize = folio_size(folio); + unsigned pglen = nfs_folio_length(folio); unsigned end = offset + copied; if (pglen == 0) { - zero_user_segments(page, 0, offset, - end, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segments(folio, 0, offset, end, fsize); + folio_mark_uptodate(folio); } else if (end >= pglen) { - zero_user_segment(page, end, PAGE_SIZE); + folio_zero_segment(folio, end, fsize); if (offset == 0) - SetPageUptodate(page); + folio_mark_uptodate(folio); } else - zero_user_segment(page, pglen, PAGE_SIZE); + folio_zero_segment(folio, pglen, fsize); } - status = nfs_updatepage(file, page, offset, copied); + status = nfs_update_folio(file, folio, offset, copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (status < 0) return status; @@ -402,14 +412,16 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, static void nfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { + struct inode *inode = folio_file_mapping(folio)->host; dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n", folio->index, offset, length); if (offset != 0 || length < folio_size(folio)) return; /* Cancel any unstarted writes on this page */ - nfs_wb_folio_cancel(folio->mapping->host, folio); + nfs_wb_folio_cancel(inode, folio); folio_wait_fscache(folio); + trace_nfs_invalidate_folio(inode, folio); } /* @@ -423,8 +435,13 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_folio(%p)\n", folio); /* If the private flag is set, then the folio is not freeable */ - if (folio_test_private(folio)) - return false; + if (folio_test_private(folio)) { + if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || + current_is_kswapd()) + return false; + if (nfs_wb_folio(folio_file_mapping(folio)->host, folio) < 0) + return false; + } return nfs_fscache_release_folio(folio, gfp); } @@ -465,12 +482,15 @@ static void nfs_check_dirty_writeback(struct folio *folio, static int nfs_launder_folio(struct folio *folio) { struct inode *inode = folio->mapping->host; + int ret; dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", inode->i_ino, folio_pos(folio)); folio_wait_fscache(folio); - return nfs_wb_page(inode, &folio->page); + ret = nfs_wb_folio(inode, folio); + trace_nfs_launder_folio_done(inode, folio, ret); + return ret; } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -547,22 +567,22 @@ const struct address_space_operations nfs_file_aops = { */ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; vm_fault_t ret = VM_FAULT_NOPAGE; struct address_space *mapping; + struct folio *folio = page_folio(vmf->page); dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", - filp, filp->f_mapping->host->i_ino, - (long long)page_offset(page)); + filp, filp->f_mapping->host->i_ino, + (long long)folio_file_pos(folio)); sb_start_pagefault(inode->i_sb); /* make sure the cache has finished storing the page */ - if (PageFsCache(page) && - wait_on_page_fscache_killable(vmf->page) < 0) { + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) { ret = VM_FAULT_RETRY; goto out; } @@ -571,25 +591,25 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) nfs_wait_bit_killable, 
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); - lock_page(page); - mapping = page_file_mapping(page); + folio_lock(folio); + mapping = folio_file_mapping(folio); if (mapping != inode->i_mapping) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - pagelen = nfs_page_length(page); + pagelen = nfs_folio_length(folio); if (pagelen == 0) goto out_unlock; ret = VM_FAULT_LOCKED; - if (nfs_flush_incompatible(filp, page) == 0 && - nfs_updatepage(filp, page, 0, pagelen) == 0) + if (nfs_flush_incompatible(filp, folio) == 0 && + nfs_update_folio(filp, folio, 0, pagelen) == 0) goto out; ret = VM_FAULT_SIGBUS; out_unlock: - unlock_page(page); + folio_unlock(folio); out: sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 4974cd18ca46..ce8f8934bca5 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -862,6 +862,8 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_set_lo_fail(lseg); pnfs_put_lseg(lseg); lseg = NULL; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 41468c21291d..2a65fe2a63ab 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -760,17 +760,18 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) * Record the page as unstable (an extra writeback period) and mark its * inode as dirty. */ -static inline -void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) +static inline void nfs_folio_mark_unstable(struct folio *folio, + struct nfs_commit_info *cinfo) { - if (!cinfo->dreq) { - struct inode *inode = page_file_mapping(page)->host; + if (folio && !cinfo->dreq) { + struct inode *inode = folio_file_mapping(folio)->host; + long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the * writeback is happening on the server now. 
*/ - inc_node_page_state(page, NR_WRITEBACK); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + node_stat_mod_folio(folio, NR_WRITEBACK, nr); + wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } @@ -795,6 +796,24 @@ unsigned int nfs_page_length(struct page *page) } /* + * Determine the number of bytes of data the page contains + */ +static inline size_t nfs_folio_length(struct folio *folio) +{ + loff_t i_size = i_size_read(folio_file_mapping(folio)->host); + + if (i_size > 0) { + pgoff_t index = folio_index(folio) >> folio_order(folio); + pgoff_t end_index = (i_size - 1) >> folio_shift(folio); + if (index < end_index) + return folio_size(folio); + if (index == end_index) + return offset_in_folio(folio, i_size - 1) + 1; + } + return 0; +} + +/* * Convert a umode to a dirent->d_type */ static inline @@ -807,11 +826,10 @@ unsigned char nfs_umode_to_dtype(umode_t mode) * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ -static inline -unsigned int nfs_page_array_len(unsigned int base, size_t len) +static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { - return ((unsigned long)len + (unsigned long)base + - PAGE_SIZE - 1) >> PAGE_SHIFT; + return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> + PAGE_SHIFT; } /* diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index ecb428512fe1..93e306bf4430 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -460,7 +460,8 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err >= 0) break; - if (err == -ENOTSUPP && + if ((err == -ENOTSUPP || + err == -NFS4ERR_OFFLOAD_DENIED) && nfs42_files_from_same_server(src, dst)) { err = -EOPNOTSUPP; break; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d9c332019d06..22a93ae46cd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10604,7 +10604,9 @@ static void nfs4_disable_swap(struct inode *inode) /* The state manager thread will now exit once it is * woken. 
*/ - wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state); + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); } static const struct inode_operations nfs4_dir_inode_operations = { diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 214bc56f92d2..d27919d7241d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -292,32 +292,34 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_AVAILABLE); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ); TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_DELAYED); #define show_nfs4_clp_state(state) \ __print_flags(state, "|", \ - { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \ - { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \ - { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \ - { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ - { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ - { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \ - { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \ - { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \ - { NFS4CLNT_SERVER_SCOPE_MISMATCH, \ - "SERVER_SCOPE_MISMATCH" }, \ - { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \ - { NFS4CLNT_BIND_CONN_TO_SESSION, \ - "BIND_CONN_TO_SESSION" }, \ - { NFS4CLNT_MOVED, "MOVED" }, \ - { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ - { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ - { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ - { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \ - { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \ - { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" }) + { BIT(NFS4CLNT_MANAGER_RUNNING), "MANAGER_RUNNING" }, \ + { BIT(NFS4CLNT_CHECK_LEASE), "CHECK_LEASE" }, \ + { BIT(NFS4CLNT_LEASE_EXPIRED), "LEASE_EXPIRED" }, \ + { BIT(NFS4CLNT_RECLAIM_REBOOT), "RECLAIM_REBOOT" }, \ + { BIT(NFS4CLNT_RECLAIM_NOGRACE), "RECLAIM_NOGRACE" }, \ + { BIT(NFS4CLNT_DELEGRETURN), "DELEGRETURN" }, \ + { BIT(NFS4CLNT_SESSION_RESET), "SESSION_RESET" }, \ + { BIT(NFS4CLNT_LEASE_CONFIRM), "LEASE_CONFIRM" }, \ + { BIT(NFS4CLNT_SERVER_SCOPE_MISMATCH), "SERVER_SCOPE_MISMATCH" }, \ + { BIT(NFS4CLNT_PURGE_STATE), "PURGE_STATE" }, \ + { BIT(NFS4CLNT_BIND_CONN_TO_SESSION), "BIND_CONN_TO_SESSION" }, \ + { BIT(NFS4CLNT_MOVED), "MOVED" }, \ + { BIT(NFS4CLNT_LEASE_MOVED), "LEASE_MOVED" }, \ + { BIT(NFS4CLNT_DELEGATION_EXPIRED), "DELEGATION_EXPIRED" }, \ + { BIT(NFS4CLNT_RUN_MANAGER), "RUN_MANAGER" }, \ + { BIT(NFS4CLNT_MANAGER_AVAILABLE), "MANAGER_AVAILABLE" }, \ + { BIT(NFS4CLNT_RECALL_RUNNING), "RECALL_RUNNING" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_READ), "RECALL_ANY_LAYOUT_READ" }, \ + { BIT(NFS4CLNT_RECALL_ANY_LAYOUT_RW), "RECALL_ANY_LAYOUT_RW" }, \ + { BIT(NFS4CLNT_DELEGRETURN_DELAYED), "DELEGRETURN_DELAYED" }) TRACE_EVENT(nfs4_state_mgr, TP_PROTO( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 642f6921852f..a778713343df 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -152,8 +152,6 @@ DEFINE_NFS_INODE_EVENT(nfs_getattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_getattr_exit); DEFINE_NFS_INODE_EVENT(nfs_setattr_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_setattr_exit); -DEFINE_NFS_INODE_EVENT(nfs_writeback_page_enter); -DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_page_exit); DEFINE_NFS_INODE_EVENT(nfs_writeback_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); @@ -933,13 +931,13 @@ TRACE_EVENT(nfs_sillyrename_unlink, ) ); -TRACE_EVENT(nfs_aop_readpage, +DECLARE_EVENT_CLASS(nfs_folio_event, TP_PROTO( const struct inode *inode, - struct page *page + struct folio *folio ), - TP_ARGS(inode, page), + TP_ARGS(inode, folio), TP_STRUCT__entry( __field(dev_t, dev) @@ -947,6 +945,7 @@ TRACE_EVENT(nfs_aop_readpage, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) + __field(u32, count) ), TP_fast_assign( @@ -956,26 +955,36 @@ TRACE_EVENT(nfs_aop_readpage, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = folio_file_pos(folio); + __entry->count = nfs_folio_length(folio); ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, - __entry->offset + __entry->offset, __entry->count ) ); -TRACE_EVENT(nfs_aop_readpage_done, +#define DEFINE_NFS_FOLIO_EVENT(name) \ + DEFINE_EVENT(nfs_folio_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct folio *folio \ + ), \ + TP_ARGS(inode, folio)) + +DECLARE_EVENT_CLASS(nfs_folio_event_done, TP_PROTO( const struct inode *inode, - struct page *page, + struct folio *folio, int ret ), - TP_ARGS(inode, page, ret), + TP_ARGS(inode, folio, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -984,6 +993,7 @@ TRACE_EVENT(nfs_aop_readpage_done, __field(u64, fileid) __field(u64, version) __field(loff_t, offset) + __field(u32, count) ), TP_fast_assign( @@ -993,19 +1003,39 @@ TRACE_EVENT(nfs_aop_readpage_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = folio_file_pos(folio); + __entry->count = nfs_folio_length(folio); __entry->ret = ret; ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld ret=%d", + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "offset=%lld count=%u ret=%d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, __entry->version, - __entry->offset, __entry->ret + __entry->offset, __entry->count, __entry->ret ) ); +#define DEFINE_NFS_FOLIO_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_folio_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct folio *folio, \ + int ret \ + ), \ + TP_ARGS(inode, folio, ret)) + +DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 779bfc37233c..64fa8de199de 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -32,6 +32,42 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; +struct nfs_page_iter_page { + const struct nfs_page *req; + size_t count; +}; + +static void nfs_page_iter_page_init(struct nfs_page_iter_page *i, + const struct nfs_page *req) +{ + i->req = req; + i->count = 0; +} + 
+static void nfs_page_iter_page_advance(struct nfs_page_iter_page *i, size_t sz) +{ + const struct nfs_page *req = i->req; + size_t tmp = i->count + sz; + + i->count = (tmp < req->wb_bytes) ? tmp : req->wb_bytes; +} + +static struct page *nfs_page_iter_page_get(struct nfs_page_iter_page *i) +{ + const struct nfs_page *req = i->req; + struct page *page; + + if (i->count != req->wb_bytes) { + size_t base = i->count + req->wb_pgbase; + size_t len = PAGE_SIZE - offset_in_page(base); + + page = nfs_page_to_page(req, base); + nfs_page_iter_page_advance(i, len); + return page; + } + return NULL; +} + static struct nfs_pgio_mirror * nfs_pgio_get_mirror(struct nfs_pageio_descriptor *desc, u32 idx) { @@ -391,7 +427,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) * has extra ref from the write/commit path to handle handoff * between write and commit lists. */ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { - inode = page_file_mapping(req->wb_page)->host; + inode = nfs_page_to_inode(req); set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); atomic_long_inc(&NFS_I(inode)->nrequests); @@ -431,10 +467,9 @@ out: nfs_release_request(head); } -static struct nfs_page * -__nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, - unsigned int pgbase, unsigned int offset, - unsigned int count) +static struct nfs_page *nfs_page_create(struct nfs_lock_context *l_ctx, + unsigned int pgbase, pgoff_t index, + unsigned int offset, unsigned int count) { struct nfs_page *req; struct nfs_open_context *ctx = l_ctx->open_context; @@ -453,42 +488,90 @@ __nfs_create_request(struct nfs_lock_context *l_ctx, struct page *page, /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ - req->wb_page = page; - if (page) { - req->wb_index = page_index(page); - get_page(page); - } - req->wb_offset = offset; - req->wb_pgbase = pgbase; - req->wb_bytes = count; + req->wb_pgbase = pgbase; + req->wb_index = index; + req->wb_offset = offset; + req->wb_bytes = count; kref_init(&req->wb_kref); req->wb_nio = 0; return req; } +static void nfs_page_assign_folio(struct nfs_page *req, struct folio *folio) +{ + if (folio != NULL) { + req->wb_folio = folio; + folio_get(folio); + set_bit(PG_FOLIO, &req->wb_flags); + } +} + +static void nfs_page_assign_page(struct nfs_page *req, struct page *page) +{ + if (page != NULL) { + req->wb_page = page; + get_page(page); + } +} + /** - * nfs_create_request - Create an NFS read/write request. + * nfs_page_create_from_page - Create an NFS read/write request. * @ctx: open context to use * @page: page to write - * @offset: starting offset within the page for the write + * @pgbase: starting offset within the page for the write + * @offset: file offset for the write * @count: number of bytes to read/write * * The page must be locked by the caller. This makes sure we never * create two different requests for the same page. * User should ensure it is safe to sleep in this function. 
*/ -struct nfs_page * -nfs_create_request(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, + struct page *page, + unsigned int pgbase, loff_t offset, + unsigned int count) +{ + struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); + struct nfs_page *ret; + + if (IS_ERR(l_ctx)) + return ERR_CAST(l_ctx); + ret = nfs_page_create(l_ctx, pgbase, offset >> PAGE_SHIFT, + offset_in_page(offset), count); + if (!IS_ERR(ret)) { + nfs_page_assign_page(ret, page); + nfs_page_group_init(ret, NULL); + } + nfs_put_lock_context(l_ctx); + return ret; +} + +/** + * nfs_page_create_from_folio - Create an NFS read/write request. + * @ctx: open context to use + * @folio: folio to write + * @offset: starting offset within the folio for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page *nfs_page_create_from_folio(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int count) { struct nfs_lock_context *l_ctx = nfs_get_lock_context(ctx); struct nfs_page *ret; if (IS_ERR(l_ctx)) return ERR_CAST(l_ctx); - ret = __nfs_create_request(l_ctx, page, offset, offset, count); - if (!IS_ERR(ret)) + ret = nfs_page_create(l_ctx, offset, folio_index(folio), offset, count); + if (!IS_ERR(ret)) { + nfs_page_assign_folio(ret, folio); nfs_page_group_init(ret, NULL); + } nfs_put_lock_context(l_ctx); return ret; } @@ -501,10 +584,16 @@ nfs_create_subreq(struct nfs_page *req, { struct nfs_page *last; struct nfs_page *ret; + struct folio *folio = nfs_page_to_folio(req); + struct page *page = nfs_page_to_page(req, pgbase); - ret = __nfs_create_request(req->wb_lock_context, req->wb_page, - pgbase, offset, count); + ret = nfs_page_create(req->wb_lock_context, pgbase, req->wb_index, + offset, count); if (!IS_ERR(ret)) { + if (folio) + nfs_page_assign_folio(ret, folio); + else + nfs_page_assign_page(ret, page); /* find the last request */ for (last = req->wb_head; last->wb_this_page != req->wb_head; @@ -512,7 +601,6 @@ nfs_create_subreq(struct nfs_page *req, ; nfs_lock_request(ret); - ret->wb_index = req->wb_index; nfs_page_group_init(ret, last); ret->wb_nio = req->wb_nio; } @@ -551,11 +639,16 @@ void nfs_unlock_and_release_request(struct nfs_page *req) */ static void nfs_clear_request(struct nfs_page *req) { + struct folio *folio = nfs_page_to_folio(req); struct page *page = req->wb_page; struct nfs_lock_context *l_ctx = req->wb_lock_context; struct nfs_open_context *ctx; - if (page != NULL) { + if (folio != NULL) { + folio_put(folio); + req->wb_folio = NULL; + clear_bit(PG_FOLIO, &req->wb_flags); + } else if (page != NULL) { put_page(page); req->wb_page = NULL; } @@ -693,13 +786,14 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr + * @pgbase: base * @count: Number of bytes to read * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ -static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, - unsigned int count, - int how, struct nfs_commit_info *cinfo) +static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, unsigned int pgbase, + unsigned int count, int how, + struct nfs_commit_info *cinfo) { struct nfs_page *req = hdr->req; @@ -710,7 +804,7 @@ 
static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, hdr->args.offset = req_offset(req); /* pnfs_set_layoutcommit needs this */ hdr->mds_offset = hdr->args.offset; - hdr->args.pgbase = req->wb_pgbase; + hdr->args.pgbase = pgbase; hdr->args.pages = hdr->page_array.pagevec; hdr->args.count = count; hdr->args.context = get_nfs_open_context(nfs_req_openctx(req)); @@ -896,9 +990,10 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + unsigned int pg_base = offset_in_page(mirror->pg_base); gfp_t gfp_flags = nfs_io_gfp_mask(); - pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); + pagecount = nfs_page_array_len(pg_base, mirror->pg_count); pg_array->npages = pagecount; if (pagecount <= ARRAY_SIZE(pg_array->page_array)) @@ -918,16 +1013,26 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, last_page = NULL; pageused = 0; while (!list_empty(head)) { + struct nfs_page_iter_page i; + struct page *page; + req = nfs_list_entry(head->next); nfs_list_move_request(req, &hdr->pages); - if (!last_page || last_page != req->wb_page) { - pageused++; - if (pageused > pagecount) - break; - *pages++ = last_page = req->wb_page; + if (req->wb_pgbase == 0) + last_page = NULL; + + nfs_page_iter_page_init(&i, req); + while ((page = nfs_page_iter_page_get(&i)) != NULL) { + if (last_page != page) { + pageused++; + if (pageused > pagecount) + goto full; + *pages++ = last_page = page; + } } } +full: if (WARN_ON_ONCE(pageused != pagecount)) { nfs_pgio_error(hdr); desc->pg_error = -EINVAL; @@ -939,7 +1044,8 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, mirror->pg_count, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, pg_base, mirror->pg_count, desc->pg_ioflags, + &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -1035,6 +1141,24 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, return l1->lockowner == l2->lockowner; } +static bool nfs_page_is_contiguous(const struct nfs_page *prev, + const struct nfs_page *req) +{ + size_t prev_end = prev->wb_pgbase + prev->wb_bytes; + + if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + return false; + if (req->wb_pgbase == 0) + return prev_end == nfs_page_max_length(prev); + if (req->wb_pgbase == prev_end) { + struct folio *folio = nfs_page_to_folio(req); + if (folio) + return folio == nfs_page_to_folio(prev); + return req->wb_page == prev->wb_page; + } + return false; +} + /** * nfs_coalesce_size - test two requests for compatibility * @prev: pointer to nfs_page @@ -1063,16 +1187,8 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev, !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return 0; - if (req_offset(req) != req_offset(prev) + prev->wb_bytes) + if (!nfs_page_is_contiguous(prev, req)) return 0; - if (req->wb_page == prev->wb_page) { - if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) - return 0; - } else { - if (req->wb_pgbase != 0 || - prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) - return 0; - } } return pgio->pg_ops->pg_test(pgio, prev, req); } @@ -1412,16 +1528,21 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { struct nfs_pgio_mirror *mirror; struct nfs_page *prev; + struct folio *folio; u32 midx; for (midx = 0; midx < desc->pg_mirror_count; midx++) { mirror = nfs_pgio_get_mirror(desc, 
midx); if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) { - nfs_pageio_complete(desc); - break; - } + folio = nfs_page_to_folio(prev); + if (folio) { + if (index == folio_next_index(folio)) + continue; + } else if (index == prev->wb_index + 1) + continue; + nfs_pageio_complete(desc); + break; } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a5db5158c634..306cba0b9e69 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -511,7 +511,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); + pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e3e6a41f19de..d886c8226d8f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -193,7 +193,7 @@ struct pnfs_commit_ops { void (*recover_commit_reqs) (struct list_head *list, struct nfs_commit_info *cinfo); struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); + struct folio *folio); }; struct pnfs_layout_hdr { @@ -395,7 +395,7 @@ void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, - struct page *page); + struct folio *folio); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -557,13 +557,13 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) + struct folio *folio) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) return NULL; - return fl_cinfo->ops->search_commit_reqs(cinfo, page); + return fl_cinfo->ops->search_commit_reqs(cinfo, folio); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -864,7 +864,7 @@ pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, - struct page *page) + struct folio *folio) { return NULL; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 5d035dd2d7bf..a0112ad4937a 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); static struct nfs_page * pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, - unsigned int nbuckets, struct page *page) + unsigned int nbuckets, struct folio *folio) { struct nfs_page *req; struct pnfs_commit_bucket *b; @@ -363,11 +363,11 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, * request is found */ for (i = 0, b = buckets; i < nbuckets; i++, b++) { list_for_each_entry(req, &b->written, wb_list) { - if (req->wb_page == page) + if (nfs_page_to_folio(req) == folio) return req->wb_head; } list_for_each_entry(req, &b->committing, wb_list) { - if (req->wb_page == page) + if (nfs_page_to_folio(req) == folio) return req->wb_head; } } @@ -375,14 +375,14 @@ pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, } /* pnfs_generic_search_commit_reqs - Search lists in @cinfo for 
the head request - * for @page + * for @folio * @cinfo - commit info for current inode - * @page - page to search for matching head request + * @folio - page to search for matching head request * * Return: the head request if one is found, otherwise %NULL. */ -struct nfs_page * -pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) +struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, + struct folio *folio) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_array *array; @@ -390,7 +390,7 @@ pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { req = pnfs_bucket_search_commit_reqs(array->buckets, - array->nbuckets, page); + array->nbuckets, folio); if (req) return req; } @@ -1180,7 +1180,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list_locked(req, list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - nfs_mark_page_unstable(req->wb_page, cinfo); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); return; out_resched: mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8ae2c8d1219d..c380cff4108e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -49,12 +49,11 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) kmem_cache_free(nfs_rdata_cachep, rhdr); } -static -int nfs_return_empty_page(struct page *page) +static int nfs_return_empty_folio(struct folio *folio) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); - unlock_page(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } @@ -111,18 +110,18 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_readpage_release(struct nfs_page *req, int error) { struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct page *page = req->wb_page; + struct folio *folio = nfs_page_to_folio(req); dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), req->wb_bytes, (long long)req_offset(req)); if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) - SetPageError(page); + folio_set_error(folio); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(page)) - nfs_fscache_write_page(inode, page); - unlock_page(page); + if (folio_test_uptodate(folio)) + nfs_fscache_write_page(inode, &folio->page); + folio_unlock(folio); } nfs_release_request(req); } @@ -135,7 +134,7 @@ struct nfs_readdesc { static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) - SetPageUptodate(req->wb_page); + folio_mark_uptodate(nfs_page_to_folio(req)); } static void nfs_read_completion(struct nfs_pgio_header *hdr) @@ -147,7 +146,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) goto out; while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - struct page *page = req->wb_page; + struct folio *folio = nfs_page_to_folio(req); unsigned long start = req->wb_pgbase; unsigned long end = req->wb_pgbase + req->wb_bytes; @@ -157,14 +156,14 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } else if (hdr->good_bytes - bytes < req->wb_bytes) { /* 
part of this request has good bytes, but * not all. zero the bad bytes */ start += hdr->good_bytes - bytes; WARN_ON(start < req->wb_pgbase); - zero_user_segment(page, start, end); + folio_zero_segment(folio, start, end); } } error = 0; @@ -281,33 +280,34 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } -static int -readpage_async_filler(struct nfs_readdesc *desc, struct page *page) +static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - unsigned int rsize = NFS_SERVER(inode)->rsize; + struct inode *inode = folio_file_mapping(folio)->host; + struct nfs_server *server = NFS_SERVER(inode); + size_t fsize = folio_size(folio); + unsigned int rsize = server->rsize; struct nfs_page *new; unsigned int len, aligned_len; int error; - len = nfs_page_length(page); + len = nfs_folio_length(folio); if (len == 0) - return nfs_return_empty_page(page); + return nfs_return_empty_folio(folio); - aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize); - if (!IS_SYNC(page->mapping->host)) { - error = nfs_fscache_read_page(page->mapping->host, page); + if (!IS_SYNC(inode)) { + error = nfs_fscache_read_page(inode, &folio->page); if (error == 0) goto out_unlock; } - new = nfs_create_request(desc->ctx, page, 0, aligned_len); + new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len); if (IS_ERR(new)) goto out_error; - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < fsize) + folio_zero_segment(folio, len, fsize); if (!nfs_pageio_add_request(&desc->pgio, new)) { nfs_list_remove_request(new); error = desc->pgio.pg_error; @@ -318,7 +318,7 @@ readpage_async_filler(struct nfs_readdesc *desc, struct page *page) out_error: error = PTR_ERR(new); out_unlock: - unlock_page(page); + folio_unlock(folio); out: return error; } @@ -331,61 +331,54 @@ out: */ int nfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct nfs_readdesc desc; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = file_inode(file); int ret; - trace_nfs_aop_readpage(inode, page); + trace_nfs_aop_readpage(inode, folio); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); /* * Try to flush any pending writes to the file.. * - * NOTE! Because we own the page lock, there cannot + * NOTE! Because we own the folio lock, there cannot * be any new pending writes generated at this point - * for this page (other pages can be written to). + * for this folio (other folios can be written to). */ - ret = nfs_wb_page(inode, page); + ret = nfs_wb_folio(inode, folio); if (ret) goto out_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out_unlock; ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; - if (file == NULL) { - ret = -EBADF; - desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (desc.ctx == NULL) - goto out_unlock; - } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); xchg(&desc.ctx->error, 0); nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - ret = readpage_async_filler(&desc, page); + ret = readpage_async_filler(&desc, folio); if (ret) goto out; nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? 
desc.pgio.pg_error : 0; if (!ret) { - ret = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !ret) + ret = folio_wait_locked_killable(folio); + if (!folio_test_uptodate(folio) && !ret) ret = xchg(&desc.ctx->error, 0); } out: put_nfs_open_context(desc.ctx); - trace_nfs_aop_readpage_done(inode, page, ret); + trace_nfs_aop_readpage_done(inode, folio, ret); return ret; out_unlock: - unlock_page(page); - trace_nfs_aop_readpage_done(inode, page, ret); + folio_unlock(folio); + trace_nfs_aop_readpage_done(inode, folio, ret); return ret; } @@ -395,7 +388,7 @@ void nfs_readahead(struct readahead_control *ractl) struct file *file = ractl->file; struct nfs_readdesc desc; struct inode *inode = ractl->mapping->host; - struct page *page; + struct folio *folio; int ret; trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages); @@ -416,9 +409,8 @@ void nfs_readahead(struct readahead_control *ractl) nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - while ((page = readahead_page(ractl)) != NULL) { - ret = readpage_async_filler(&desc, page); - put_page(page); + while ((folio = readahead_folio(ractl)) != NULL) { + ret = readpage_async_filler(&desc, folio); if (ret) break; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a80d548253a..b508c985eb14 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -64,7 +64,7 @@ static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page); + struct folio *folio); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -171,31 +171,28 @@ nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) return 0; } -static struct nfs_page * -nfs_page_private_request(struct page *page) +static struct nfs_page *nfs_folio_private_request(struct folio *folio) { - if (!PagePrivate(page)) - return NULL; - return (struct nfs_page *)page_private(page); + return folio_get_private(folio); } -/* - * nfs_page_find_head_request_locked - find head request associated with @page +/** + * nfs_folio_find_private_request - find head request associated with a folio + * @folio: pointer to folio * * must be called while holding the inode lock. * * returns matching head request with reference held, or NULL if not found. 
*/ -static struct nfs_page * -nfs_page_find_private_request(struct page *page) +static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = folio_file_mapping(folio); struct nfs_page *req; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; spin_lock(&mapping->private_lock); - req = nfs_page_private_request(page); + req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); @@ -204,18 +201,17 @@ nfs_page_find_private_request(struct page *page) return req; } -static struct nfs_page * -nfs_page_find_swap_request(struct page *page) +static struct nfs_page *nfs_folio_find_swap_request(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *req = NULL; - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return NULL; mutex_lock(&nfsi->commit_mutex); - if (PageSwapCache(page)) { + if (folio_test_swapcache(folio)) { req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); + folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); @@ -225,29 +221,30 @@ nfs_page_find_swap_request(struct page *page) return req; } -/* - * nfs_page_find_head_request - find head request associated with @page +/** + * nfs_folio_find_head_request - find head request associated with a folio + * @folio: pointer to folio * * returns matching head request with reference held, or NULL if not found. */ -static struct nfs_page *nfs_page_find_head_request(struct page *page) +static struct nfs_page *nfs_folio_find_head_request(struct folio *folio) { struct nfs_page *req; - req = nfs_page_find_private_request(page); + req = nfs_folio_find_private_request(folio); if (!req) - req = nfs_page_find_swap_request(page); + req = nfs_folio_find_swap_request(folio); return req; } -static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) +static struct nfs_page *nfs_folio_find_and_lock_request(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *req, *head; int ret; for (;;) { - req = nfs_page_find_head_request(page); + req = nfs_folio_find_head_request(folio); if (!req) return req; head = nfs_page_group_lock_head(req); @@ -261,9 +258,9 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) return ERR_PTR(ret); } /* Ensure that nobody removed the request before we locked it */ - if (head == nfs_page_private_request(page)) + if (head == nfs_folio_private_request(folio)) break; - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) break; nfs_unlock_and_release_request(head); } @@ -271,18 +268,19 @@ static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) } /* Adjust the file length if we're writing beyond the end */ -static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) +static void nfs_grow_file(struct folio *folio, unsigned int offset, + unsigned int count) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); - end_index = (i_size - 1) >> PAGE_SHIFT; - if (i_size > 0 && page_index(page) < end_index) + end_index = ((i_size - 1) >> 
folio_shift(folio)) << folio_order(folio); + if (i_size > 0 && folio_index(folio) < end_index) goto out; - end = page_file_offset(page) + ((loff_t)offset+count); + end = folio_file_pos(folio) + (loff_t)offset + (loff_t)count; if (i_size >= end) goto out; trace_nfs_size_grow(inode, end); @@ -308,11 +306,11 @@ static void nfs_set_pageerror(struct address_space *mapping) spin_unlock(&inode->i_lock); } -static void nfs_mapping_set_error(struct page *page, int error) +static void nfs_mapping_set_error(struct folio *folio, int error) { - struct address_space *mapping = page_file_mapping(page); + struct address_space *mapping = folio_file_mapping(folio); - SetPageError(page); + folio_set_error(folio); filemap_set_wb_err(mapping, error); if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, @@ -359,9 +357,9 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) */ static bool nfs_page_group_covers_page(struct nfs_page *req) { + unsigned int len = nfs_folio_length(nfs_page_to_folio(req)); struct nfs_page *tmp; unsigned int pos = 0; - unsigned int len = nfs_page_length(req->wb_page); nfs_page_group_lock(req); @@ -381,11 +379,13 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) */ static void nfs_mark_uptodate(struct nfs_page *req) { - if (PageUptodate(req->wb_page)) + struct folio *folio = nfs_page_to_folio(req); + + if (folio_test_uptodate(folio)) return; if (!nfs_page_group_covers_page(req)) return; - SetPageUptodate(req->wb_page); + folio_mark_uptodate(folio); } static int wb_priority(struct writeback_control *wbc) @@ -407,35 +407,34 @@ int nfs_congestion_kb; #define NFS_CONGESTION_OFF_THRESH \ (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) -static void nfs_set_page_writeback(struct page *page) +static void nfs_folio_set_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - int ret = test_set_page_writeback(page); - - WARN_ON_ONCE(ret != 0); + struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) + folio_start_writeback(folio); + if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) nfss->write_congested = 1; } -static void nfs_end_page_writeback(struct nfs_page *req) +static void nfs_folio_end_writeback(struct folio *folio) { - struct inode *inode = page_file_mapping(req->wb_page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - bool is_done; + struct nfs_server *nfss = NFS_SERVER(folio_file_mapping(folio)->host); - is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); - nfs_unlock_request(req); - if (!is_done) - return; - - end_page_writeback(req->wb_page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + folio_end_writeback(folio); + if (atomic_long_dec_return(&nfss->writeback) < + NFS_CONGESTION_OFF_THRESH) nfss->write_congested = 0; } +static void nfs_page_end_writeback(struct nfs_page *req) +{ + if (nfs_page_group_sync_on_bit(req, PG_WB_END)) { + nfs_unlock_request(req); + nfs_folio_end_writeback(nfs_page_to_folio(req)); + } else + nfs_unlock_request(req); +} + /* * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests * @@ -550,7 +549,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) /* * nfs_lock_and_join_requests - join all subreqs to the head req - * @page: the page used to lookup the "page group" of nfs_page structures + * @folio: the folio used to lookup the 
"page group" of nfs_page structures * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations @@ -560,13 +559,12 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * * Returns a locked, referenced pointer to the head request - which after * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * Returns NULL if no requests are found for @folio, or a ERR_PTR if an * error was encountered. */ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page) +static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; int ret; @@ -575,7 +573,7 @@ nfs_lock_and_join_requests(struct page *page) * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_find_and_lock_page_request(page); + head = nfs_folio_find_and_lock_request(folio); if (IS_ERR_OR_NULL(head)) return head; @@ -593,11 +591,10 @@ nfs_lock_and_join_requests(struct page *page) static void nfs_write_error(struct nfs_page *req, int error) { - trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req, - error); - nfs_mapping_set_error(req->wb_page, error); + trace_nfs_write_error(nfs_page_to_inode(req), req, error); + nfs_mapping_set_error(nfs_page_to_folio(req), error); nfs_inode_remove_request(req); - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } @@ -605,21 +602,21 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ -static int nfs_page_async_flush(struct page *page, +static int nfs_page_async_flush(struct folio *folio, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; - req = nfs_lock_and_join_requests(page); + req = nfs_lock_and_join_requests(folio); if (!req) goto out; ret = PTR_ERR(req); if (IS_ERR(req)) goto out; - nfs_set_page_writeback(page); + nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ @@ -637,12 +634,12 @@ static int nfs_page_async_flush(struct page *page, goto out_launder; if (wbc->sync_mode == WB_SYNC_NONE) ret = AOP_WRITEPAGE_ACTIVATE; - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; } else - nfs_add_stats(page_file_mapping(page)->host, - NFSIOS_WRITEPAGES, 1); + nfs_add_stats(folio_file_mapping(folio)->host, + NFSIOS_WRITEPAGES, 1); out: return ret; out_launder: @@ -650,21 +647,21 @@ out_launder: return 0; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - nfs_pageio_cond_complete(pgio, page_index(page)); - return nfs_page_async_flush(page, wbc, pgio); + nfs_pageio_cond_complete(pgio, folio_index(folio)); + return nfs_page_async_flush(folio, wbc, pgio); } /* * Write an mmapped page to the server. 
*/ -static int nfs_writepage_locked(struct page *page, +static int nfs_writepage_locked(struct folio *folio, struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; - struct inode *inode = page_file_mapping(page)->host; + struct inode *inode = folio_file_mapping(folio)->host; int err; if (wbc->sync_mode == WB_SYNC_NONE && @@ -672,9 +669,9 @@ static int nfs_writepage_locked(struct page *page, return AOP_WRITEPAGE_ACTIVATE; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, 0, - false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + nfs_pageio_init_write(&pgio, inode, 0, false, + &nfs_async_write_completion_ops); + err = nfs_do_writepage(folio, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); return err; @@ -682,19 +679,22 @@ static int nfs_writepage_locked(struct page *page, int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(folio, wbc); if (ret != AOP_WRITEPAGE_ACTIVATE) unlock_page(page); return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct folio *folio = page_folio(page); int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(folio, wbc, data); if (ret != AOP_WRITEPAGE_ACTIVATE) unlock_page(page); return ret; @@ -750,10 +750,11 @@ out_err: /* * Insert a write request into an inode */ -static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct nfs_page *req) { - struct address_space *mapping = page_file_mapping(req->wb_page); - struct nfs_inode *nfsi = NFS_I(inode); + struct folio *folio = nfs_page_to_folio(req); + struct address_space *mapping = folio_file_mapping(folio); + struct nfs_inode *nfsi = NFS_I(mapping->host); WARN_ON_ONCE(req->wb_this_page != req); @@ -765,10 +766,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) * with invalidate/truncate. 
*/ spin_lock(&mapping->private_lock); - if (likely(!PageSwapCache(req->wb_page))) { + if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); + folio_set_private(folio); + folio->private = req; } spin_unlock(&mapping->private_lock); atomic_long_inc(&nfsi->nrequests); @@ -785,47 +786,43 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct address_space *mapping = page_file_mapping(req->wb_page); - struct inode *inode = mapping->host; - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_page *head; - if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { - head = req->wb_head; + struct folio *folio = nfs_page_to_folio(req->wb_head); + struct address_space *mapping = folio_file_mapping(folio); spin_lock(&mapping->private_lock); - if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { - set_page_private(head->wb_page, 0); - ClearPagePrivate(head->wb_page); - clear_bit(PG_MAPPED, &head->wb_flags); + if (likely(folio && !folio_test_swapcache(folio))) { + folio->private = NULL; + folio_clear_private(folio); + clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { nfs_release_request(req); - atomic_long_dec(&nfsi->nrequests); + atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } -static void -nfs_mark_request_dirty(struct nfs_page *req) +static void nfs_mark_request_dirty(struct nfs_page *req) { - if (req->wb_page) - __set_page_dirty_nobuffers(req->wb_page); + struct folio *folio = nfs_page_to_folio(req); + if (folio) + filemap_dirty_folio(folio_mapping(folio), folio); } /* * nfs_page_search_commits_for_head_request_locked * - * Search through commit lists on @inode for the head request for @page. + * Search through commit lists on @inode for the head request for @folio. * Must be called while holding the inode (which is cinfo) lock. * * Returns the head request if found, or NULL if not found. 
*/ static struct nfs_page * nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page) + struct folio *folio) { struct nfs_page *freq, *t; struct nfs_commit_info cinfo; @@ -834,13 +831,13 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, nfs_init_cinfo_from_inode(&cinfo, inode); /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, page); + freq = pnfs_search_commit_reqs(inode, &cinfo, folio); if (freq) return freq->wb_head; /* Linearly search the commit list for the correct request */ list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (freq->wb_page == page) + if (nfs_page_to_folio(freq) == folio) return freq->wb_head; } @@ -888,8 +885,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - if (req->wb_page) - nfs_mark_page_unstable(req->wb_page, cinfo); + nfs_folio_mark_unstable(nfs_page_to_folio(req), cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -948,12 +944,15 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, nfs_request_add_commit_list(req, cinfo); } -static void -nfs_clear_page_commit(struct page *page) +static void nfs_folio_clear_commit(struct folio *folio) { - dec_node_page_state(page, NR_WRITEBACK); - dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_WRITEBACK); + if (folio) { + long nr = folio_nr_pages(folio); + + node_stat_mod_folio(folio, NR_WRITEBACK, -nr); + wb_stat_mod(&inode_to_bdi(folio_file_mapping(folio)->host)->wb, + WB_WRITEBACK, -nr); + } } /* Called holding the request lock on @req */ @@ -971,7 +970,7 @@ nfs_clear_request_commit(struct nfs_page *req) nfs_request_remove_commit_list(req, &cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); - nfs_clear_page_commit(req->wb_page); + nfs_folio_clear_commit(nfs_page_to_folio(req)); } } @@ -1003,7 +1002,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { trace_nfs_comp_error(hdr->inode, req, hdr->error); - nfs_mapping_set_error(req->wb_page, hdr->error); + nfs_mapping_set_error(nfs_page_to_folio(req), + hdr->error); goto remove_req; } if (nfs_write_need_commit(hdr)) { @@ -1017,7 +1017,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) remove_req: nfs_inode_remove_request(req); next: - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } out: @@ -1093,10 +1093,9 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, * If the attempt fails, then the existing request is flushed out * to disk. 
*/ -static struct nfs_page *nfs_try_to_update_request(struct inode *inode, - struct page *page, - unsigned int offset, - unsigned int bytes) +static struct nfs_page *nfs_try_to_update_request(struct folio *folio, + unsigned int offset, + unsigned int bytes) { struct nfs_page *req; unsigned int rqend; @@ -1105,7 +1104,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, end = offset + bytes; - req = nfs_lock_and_join_requests(page); + req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) return req; @@ -1138,7 +1137,7 @@ out_flushme: */ nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); - error = nfs_wb_page(inode, page); + error = nfs_wb_folio(folio_file_mapping(folio)->host, folio); return (error < 0) ? ERR_PTR(error) : NULL; } @@ -1149,40 +1148,42 @@ out_flushme: * if we have to add a new request. Also assumes that the caller has * already called nfs_flush_incompatible() if necessary. */ -static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) +static struct nfs_page *nfs_setup_write_request(struct nfs_open_context *ctx, + struct folio *folio, + unsigned int offset, + unsigned int bytes) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req; + struct nfs_page *req; - req = nfs_try_to_update_request(inode, page, offset, bytes); + req = nfs_try_to_update_request(folio, offset, bytes); if (req != NULL) goto out; - req = nfs_create_request(ctx, page, offset, bytes); + req = nfs_page_create_from_folio(ctx, folio, offset, bytes); if (IS_ERR(req)) goto out; - nfs_inode_add_request(inode, req); + nfs_inode_add_request(req); out: return req; } -static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) +static int nfs_writepage_setup(struct nfs_open_context *ctx, + struct folio *folio, unsigned int offset, + unsigned int count) { - struct nfs_page *req; + struct nfs_page *req; - req = nfs_setup_write_request(ctx, page, offset, count); + req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); /* Update file length */ - nfs_grow_file(page, offset, count); + nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); return 0; } -int nfs_flush_incompatible(struct file *file, struct page *page) +int nfs_flush_incompatible(struct file *file, struct folio *folio) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; @@ -1198,12 +1199,12 @@ int nfs_flush_incompatible(struct file *file, struct page *page) * dropped page. */ do { - req = nfs_page_find_head_request(page); + req = nfs_folio_find_head_request(folio); if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || - !nfs_match_open_context(nfs_req_openctx(req), ctx); + do_flush = nfs_page_to_folio(req) != folio || + !nfs_match_open_context(nfs_req_openctx(req), ctx); if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1212,7 +1213,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page_file_mapping(page)->host, page); + status = nfs_wb_folio(folio_file_mapping(folio)->host, folio); } while (status == 0); return status; } @@ -1284,9 +1285,9 @@ out: * the PageUptodate() flag. 
In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, - unsigned int pagelen) +static bool nfs_folio_write_uptodate(struct folio *folio, unsigned int pagelen) { + struct inode *inode = folio_file_mapping(folio)->host; struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) @@ -1300,7 +1301,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, out: if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; - return PageUptodate(page) != 0; + return folio_test_uptodate(folio) != 0; } static bool @@ -1318,16 +1319,17 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, - struct inode *inode, unsigned int pagelen) +static int nfs_can_extend_write(struct file *file, struct folio *folio, + unsigned int pagelen) { - int ret; + struct inode *inode = file_inode(file); struct file_lock_context *flctx = locks_inode_context(inode); struct file_lock *fl; + int ret; if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode, pagelen)) + if (!nfs_folio_write_uptodate(folio, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1359,33 +1361,33 @@ static int nfs_can_extend_write(struct file *file, struct page *page, * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad * things with a page scheduled for an RPC call (e.g. invalidate it). */ -int nfs_updatepage(struct file *file, struct page *page, - unsigned int offset, unsigned int count) +int nfs_update_folio(struct file *file, struct folio *folio, + unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct address_space *mapping = page_file_mapping(page); - struct inode *inode = mapping->host; - unsigned int pagelen = nfs_page_length(page); + struct address_space *mapping = folio_file_mapping(folio); + struct inode *inode = mapping->host; + unsigned int pagelen = nfs_folio_length(folio); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); - dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", - file, count, (long long)(page_file_offset(page) + offset)); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, + (long long)(folio_file_pos(folio) + offset)); if (!count) goto out; - if (nfs_can_extend_write(file, page, inode, pagelen)) { + if (nfs_can_extend_write(file, folio, pagelen)) { count = max(count + offset, pagelen); offset = 0; } - status = nfs_writepage_setup(ctx, page, offset, count); + status = nfs_writepage_setup(ctx, folio, offset, count); if (status < 0) nfs_set_pageerror(mapping); out: - dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", + dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; } @@ -1421,13 +1423,13 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { - struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host); + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); /* Bump the transmission count */ req->wb_nio++; nfs_mark_request_dirty(req); atomic_long_inc(&nfsi->redirtied_pages); - nfs_end_page_writeback(req); + nfs_page_end_writeback(req); nfs_release_request(req); } @@ 
-1785,18 +1787,18 @@ void nfs_retry_commit(struct list_head *page_list, req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); - if (!cinfo->dreq) - nfs_clear_page_commit(req->wb_page); + nfs_folio_clear_commit(nfs_page_to_folio(req)); nfs_unlock_and_release_request(req); } } EXPORT_SYMBOL_GPL(nfs_retry_commit); -static void -nfs_commit_resched_write(struct nfs_commit_info *cinfo, - struct nfs_page *req) +static void nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + struct folio *folio = nfs_page_to_folio(req); + + filemap_dirty_folio(folio_mapping(folio), folio); } /* @@ -1847,12 +1849,13 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) int status = data->task.tk_status; struct nfs_commit_info cinfo; struct nfs_server *nfss; + struct folio *folio; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (req->wb_page) - nfs_clear_page_commit(req->wb_page); + folio = nfs_page_to_folio(req); + nfs_folio_clear_commit(folio); dprintk("NFS: commit (%s/%llu %d@%lld)", nfs_req_openctx(req)->dentry->d_sb->s_id, @@ -1860,10 +1863,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) req->wb_bytes, (long long)req_offset(req)); if (status < 0) { - if (req->wb_page) { + if (folio) { trace_nfs_commit_error(data->inode, req, status); - nfs_mapping_set_error(req->wb_page, status); + nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } dprintk_cont(", error = %d\n", status); @@ -1874,7 +1877,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) * returned by the server against all stored verfs. */ if (nfs_write_match_verf(verf, req)) { /* We have a match */ - if (req->wb_page) + if (folio) nfs_inode_remove_request(req); dprintk_cont(" OK\n"); goto next; @@ -2055,7 +2058,7 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(&folio->page); + req = nfs_lock_and_join_requests(folio); if (IS_ERR(req)) { ret = PTR_ERR(req); @@ -2071,13 +2074,18 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) return ret; } -/* - * Write back all requests on one page - we do this before reading it. +/** + * nfs_wb_folio - Write back all requests on one folio + * @inode: pointer to inode + * @folio: pointer to folio + * + * Assumes that the folio has been locked by the caller, and will + * not unlock it.
*/ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_folio(struct inode *inode, struct folio *folio) { - loff_t range_start = page_file_offset(page); - loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); + loff_t range_start = folio_file_pos(folio); + loff_t range_end = range_start + (loff_t)folio_size(folio) - 1; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, @@ -2086,25 +2094,25 @@ int nfs_wb_page(struct inode *inode, struct page *page) }; int ret; - trace_nfs_writeback_page_enter(inode); + trace_nfs_writeback_folio(inode, folio); for (;;) { - wait_on_page_writeback(page); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + folio_wait_writeback(folio); + if (folio_clear_dirty_for_io(folio)) { + ret = nfs_writepage_locked(folio, &wbc); if (ret < 0) goto out_error; continue; } ret = 0; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) break; ret = nfs_commit_inode(inode, FLUSH_SYNC); if (ret < 0) goto out_error; } out_error: - trace_nfs_writeback_page_exit(inode, ret); + trace_nfs_writeback_folio_done(inode, folio, ret); return ret; } diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c deleted file mode 100644 index 76bee0a0d308..000000000000 --- a/fs/nfsd/fault_inject.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com> - * - * Uses debugfs to create fault injection points for client testing - */ - -#include <linux/types.h> -#include <linux/fs.h> -#include <linux/debugfs.h> -#include <linux/module.h> -#include <linux/nsproxy.h> -#include <linux/sunrpc/addr.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include "state.h" -#include "netns.h" - -struct nfsd_fault_inject_op { - char *file; - u64 (*get)(void); - u64 (*set_val)(u64); - u64 (*set_clnt)(struct sockaddr_storage *, size_t); -}; - -static struct dentry *debug_dir; - -static ssize_t fault_inject_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - static u64 val; - char read_buf[25]; - size_t size; - loff_t pos = *ppos; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - - if (!pos) - val = op->get(); - size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); - - return simple_read_from_buffer(buf, len, ppos, read_buf, size); -} - -static ssize_t fault_inject_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - char write_buf[INET6_ADDRSTRLEN]; - size_t size = min(sizeof(write_buf) - 1, len); - struct net *net = current->nsproxy->net_ns; - struct sockaddr_storage sa; - struct nfsd_fault_inject_op *op = file_inode(file)->i_private; - u64 val; - char *nl; - - if (copy_from_user(write_buf, buf, size)) - return -EFAULT; - write_buf[size] = '\0'; - - /* Deal with any embedded newlines in the string */ - nl = strchr(write_buf, '\n'); - if (nl) { - size = nl - write_buf; - *nl = '\0'; - } - - size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); - if (size > 0) { - val = op->set_clnt(&sa, size); - if (val) - pr_info("NFSD [%s]: Client %s had %llu state object(s)\n", - op->file, write_buf, val); - } else { - val = simple_strtoll(write_buf, NULL, 0); - if (val == 0) - pr_info("NFSD Fault Injection: %s (all)", op->file); - else - pr_info("NFSD Fault Injection: %s (n = %llu)", - op->file, val); - val = op->set_val(val); - pr_info("NFSD: %s: found %llu", op->file, val); - } - return len; /* on success, claim we got the whole input */ -} - -static const 
struct file_operations fops_nfsd = { - .owner = THIS_MODULE, - .read = fault_inject_read, - .write = fault_inject_write, -}; - -void nfsd_fault_inject_cleanup(void) -{ - debugfs_remove_recursive(debug_dir); -} - -static struct nfsd_fault_inject_op inject_ops[] = { - { - .file = "forget_clients", - .get = nfsd_inject_print_clients, - .set_val = nfsd_inject_forget_clients, - .set_clnt = nfsd_inject_forget_client, - }, - { - .file = "forget_locks", - .get = nfsd_inject_print_locks, - .set_val = nfsd_inject_forget_locks, - .set_clnt = nfsd_inject_forget_client_locks, - }, - { - .file = "forget_openowners", - .get = nfsd_inject_print_openowners, - .set_val = nfsd_inject_forget_openowners, - .set_clnt = nfsd_inject_forget_client_openowners, - }, - { - .file = "forget_delegations", - .get = nfsd_inject_print_delegations, - .set_val = nfsd_inject_forget_delegations, - .set_clnt = nfsd_inject_forget_client_delegations, - }, - { - .file = "recall_delegations", - .get = nfsd_inject_print_delegations, - .set_val = nfsd_inject_recall_delegations, - .set_clnt = nfsd_inject_recall_client_delegations, - }, -}; - -void nfsd_fault_inject_init(void) -{ - unsigned int i; - struct nfsd_fault_inject_op *op; - umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - - debug_dir = debugfs_create_dir("nfsd", NULL); - - for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { - op = &inject_ops[i]; - debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd); - } -} diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c0950edb26b0..6e8712bd7c99 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) return nf; } +/** + * nfsd_file_check_write_error - check for writeback errors on a file + * @nf: nfsd_file to check for writeback errors + * + * Check whether a nfsd_file has an unseen error. Reset the write + * verifier if so. + */ static void -nfsd_file_fsync(struct nfsd_file *nf) -{ - struct file *file = nf->nf_file; - int ret; - - if (!file || !(file->f_mode & FMODE_WRITE)) - return; - ret = vfs_fsync(file, 1); - trace_nfsd_file_fsync(nf, ret); - if (ret) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); -} - -static int nfsd_file_check_write_error(struct nfsd_file *nf) { struct file *file = nf->nf_file; - if (!file || !(file->f_mode & FMODE_WRITE)) - return 0; - return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); + if ((file->f_mode & FMODE_WRITE) && + filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err))) + nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } static void nfsd_file_hash_remove(struct nfsd_file *nf) { trace_nfsd_file_unhash(nf); - - if (nfsd_file_check_write_error(nf)) - nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, nfsd_file_rhash_params); } @@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf) this_cpu_add(nfsd_file_total_age, age); nfsd_file_unhash(nf); - - /* - * We call fsync here in order to catch writeback errors. It's not - * strictly required by the protocol, but an nfsd_file could get - * evicted from the cache before a COMMIT comes in. If another - * task were to open that file in the interim and scrape the error, - * then the client may never see it. By calling fsync here, we ensure - * that writeback happens before the entry is freed, and that any - * errors reported result in the write verifier changing. 
- */ - nfsd_file_fsync(nf); - if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { get_file(nf->nf_file); filp_close(nf->nf_file, NULL); + nfsd_file_check_write_error(nf); fput(nf->nf_file); } @@ -452,7 +431,7 @@ static bool nfsd_file_lru_remove(struct nfsd_file *nf) struct nfsd_file * nfsd_file_get(struct nfsd_file *nf) { - if (likely(refcount_inc_not_zero(&nf->nf_ref))) + if (nf && refcount_inc_not_zero(&nf->nf_ref)) return nf; return NULL; } @@ -1107,8 +1086,7 @@ retry: rcu_read_lock(); nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, nfsd_file_rhash_params); - if (nf) - nf = nfsd_file_get(nf); + nf = nfsd_file_get(nf); rcu_read_unlock(); if (nf) { @@ -1159,6 +1137,7 @@ wait_for_construction: out: if (status == nfs_ok) { this_cpu_inc(nfsd_file_acquisitions); + nfsd_file_check_write_error(nf); *pnf = nf; } else { if (refcount_dec_and_test(&nf->nf_ref)) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 995cb2c90b1a..12b2b9bc07bf 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -377,10 +377,11 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = { }, }; -static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]); const struct svc_version nfsd_acl_version2 = { .vs_vers = 2, - .vs_nproc = 5, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures2), .vs_proc = nfsd_acl_procedures2, .vs_count = nfsd_acl_count2, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 887803735e2a..73adca47d373 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -266,10 +266,11 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = { }, }; -static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]); const struct svc_version nfsd_acl_version3 = { .vs_vers = 3, - .vs_nproc = 3, + .vs_nproc = ARRAY_SIZE(nfsd_acl_procedures3), .vs_proc = nfsd_acl_procedures3, .vs_count = nfsd_acl_count3, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index f41992ecd0d7..e6bb8eeb5bc2 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -1064,10 +1064,11 @@ static const struct svc_procedure nfsd_procedures3[22] = { }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count3[ARRAY_SIZE(nfsd_procedures3)]); const struct svc_version nfsd_version3 = { .vs_vers = 3, - .vs_nproc = 22, + .vs_nproc = ARRAY_SIZE(nfsd_procedures3), .vs_proc = nfsd_procedures3, .vs_dispatch = nfsd_dispatch, .vs_count = nfsd_count3, diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 3564d1c6f610..e8a80052cb1b 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -323,11 +323,11 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) if (ls->ls_recalled) goto out_unlock; - ls->ls_recalled = true; - atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); if (list_empty(&ls->ls_layouts)) goto out_unlock; + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); refcount_inc(&ls->ls_stid.sc_count); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f189ba7995f5..5ae670807449 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1214,8 +1214,10 @@ out: return status; out_put_dst: nfsd_file_put(*dst); + *dst = NULL; out_put_src: nfsd_file_put(*src); + *src 
= NULL; goto out; } @@ -1293,15 +1295,15 @@ extern void nfs_sb_deactive(struct super_block *sb); * setup a work entry in the ssc delayed unmount list. */ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, - struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt) + struct nfsd4_ssc_umount_item **nsui) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *work = NULL; struct nfsd4_ssc_umount_item *tmp; DEFINE_WAIT(wait); + __be32 status = 0; - *ss_mnt = NULL; - *retwork = NULL; + *nsui = NULL; work = kzalloc(sizeof(*work), GFP_KERNEL); try_again: spin_lock(&nn->nfsd_ssc_lock); @@ -1325,12 +1327,12 @@ try_again: finish_wait(&nn->nfsd_ssc_waitq, &wait); goto try_again; } - *ss_mnt = ni->nsui_vfsmount; + *nsui = ni; refcount_inc(&ni->nsui_refcnt); spin_unlock(&nn->nfsd_ssc_lock); kfree(work); - /* return vfsmount in ss_mnt */ + /* return vfsmount in (*nsui)->nsui_vfsmount */ return 0; } if (work) { @@ -1338,31 +1340,32 @@ try_again: refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); - *retwork = work; - } + *nsui = work; + } else + status = nfserr_resource; spin_unlock(&nn->nfsd_ssc_lock); - return 0; + return status; } -static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn, - struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt) +static void nfsd4_ssc_update_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui, + struct vfsmount *ss_mnt) { - /* set nsui_vfsmount, clear busy flag and wakeup waiters */ spin_lock(&nn->nfsd_ssc_lock); - work->nsui_vfsmount = ss_mnt; - work->nsui_busy = false; + nsui->nsui_vfsmount = ss_mnt; + nsui->nsui_busy = false; wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); } -static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, - struct nfsd4_ssc_umount_item *work) +static void nfsd4_ssc_cancel_dul(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *nsui) { spin_lock(&nn->nfsd_ssc_lock); - list_del(&work->nsui_list); + list_del(&nsui->nsui_list); wake_up_all(&nn->nfsd_ssc_waitq); spin_unlock(&nn->nfsd_ssc_lock); - kfree(work); + kfree(nsui); } /* @@ -1370,7 +1373,7 @@ static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, */ static __be32 nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, - struct vfsmount **mount) + struct nfsd4_ssc_umount_item **nsui) { struct file_system_type *type; struct vfsmount *ss_mnt; @@ -1381,7 +1384,6 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, char *ipaddr, *dev_name, *raw_data; int len, raw_len; __be32 status = nfserr_inval; - struct nfsd4_ssc_umount_item *work = NULL; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); naddr = &nss->u.nl4_addr; @@ -1389,6 +1391,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, naddr->addr_len, (struct sockaddr *)&tmp_addr, sizeof(tmp_addr)); + *nsui = NULL; if (tmp_addrlen == 0) goto out_err; @@ -1431,10 +1434,10 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); - status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt); + status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui); if (status) goto out_free_devname; - if (ss_mnt) + if ((*nsui)->nsui_vfsmount) goto out_done; /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ @@ -1442,15 +1445,12 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, module_put(type->owner); if 
(IS_ERR(ss_mnt)) { status = nfserr_nodev; - if (work) - nfsd4_ssc_cancel_dul_work(nn, work); + nfsd4_ssc_cancel_dul(nn, *nsui); goto out_free_devname; } - if (work) - nfsd4_ssc_update_dul_work(nn, work, ss_mnt); + nfsd4_ssc_update_dul(nn, *nsui, ss_mnt); out_done: status = 0; - *mount = ss_mnt; out_free_devname: kfree(dev_name); @@ -1474,7 +1474,7 @@ out_err: static __be32 nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_copy *copy, struct vfsmount **mount) + struct nfsd4_copy *copy) { struct svc_fh *s_fh = NULL; stateid_t *s_stid = ©->cp_src_stateid; @@ -1487,7 +1487,7 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, if (status) goto out; - status = nfsd4_interssc_connect(copy->cp_src, rqstp, mount); + status = nfsd4_interssc_connect(copy->cp_src, rqstp, ©->ss_nsui); if (status) goto out; @@ -1505,45 +1505,26 @@ out: } static void -nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, struct nfsd_file *dst) { - bool found = false; - long timeout; - struct nfsd4_ssc_umount_item *tmp; - struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id); + long timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); nfs42_ssc_close(filp); - nfsd_file_put(dst); fput(filp); - if (!nn) { - mntput(ss_mnt); - return; - } spin_lock(&nn->nfsd_ssc_lock); - timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); - list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { - if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) { - list_del(&ni->nsui_list); - /* - * vfsmount can be shared by multiple exports, - * decrement refcnt. If the count drops to 1 it - * will be unmounted when nsui_expire expires. - */ - refcount_dec(&ni->nsui_refcnt); - ni->nsui_expire = jiffies + timeout; - list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list); - found = true; - break; - } - } + list_del(&nsui->nsui_list); + /* + * vfsmount can be shared by multiple exports, + * decrement refcnt. If the count drops to 1 it + * will be unmounted when nsui_expire expires. 
+ */ + refcount_dec(&nsui->nsui_refcnt); + nsui->nsui_expire = jiffies + timeout; + list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list); spin_unlock(&nn->nfsd_ssc_lock); - if (!found) { - mntput(ss_mnt); - return; - } } #else /* CONFIG_NFSD_V4_2_INTER_SSC */ @@ -1551,15 +1532,13 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, static __be32 nfsd4_setup_inter_ssc(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_copy *copy, - struct vfsmount **mount) + struct nfsd4_copy *copy) { - *mount = NULL; return nfserr_inval; } static void -nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp, +nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp, struct nfsd_file *dst) { } @@ -1582,13 +1561,6 @@ nfsd4_setup_intra_ssc(struct svc_rqst *rqstp, ©->nf_dst); } -static void -nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst) -{ - nfsd_file_put(src); - nfsd_file_put(dst); -} - static void nfsd4_cb_offload_release(struct nfsd4_callback *cb) { struct nfsd4_cb_offload *cbo = @@ -1700,18 +1672,27 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server)); memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid)); memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh)); - dst->ss_mnt = src->ss_mnt; + dst->ss_nsui = src->ss_nsui; +} + +static void release_copy_files(struct nfsd4_copy *copy) +{ + if (copy->nf_src) + nfsd_file_put(copy->nf_src); + if (copy->nf_dst) + nfsd_file_put(copy->nf_dst); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_copy_state(copy); - nfsd_file_put(copy->nf_dst); - if (!nfsd4_ssc_is_inter(copy)) - nfsd_file_put(copy->nf_src); - spin_lock(©->cp_clp->async_lock); - list_del(©->copies); - spin_unlock(©->cp_clp->async_lock); + release_copy_files(copy); + if (copy->cp_clp) { + spin_lock(©->cp_clp->async_lock); + if (!list_empty(©->copies)) + list_del_init(©->copies); + spin_unlock(©->cp_clp->async_lock); + } nfs4_put_copy(copy); } @@ -1749,8 +1730,8 @@ static int nfsd4_do_async_copy(void *data) if (nfsd4_ssc_is_inter(copy)) { struct file *filp; - filp = nfs42_ssc_open(copy->ss_mnt, ©->c_fh, - ©->stateid); + filp = nfs42_ssc_open(copy->ss_nsui->nsui_vfsmount, + ©->c_fh, ©->stateid); if (IS_ERR(filp)) { switch (PTR_ERR(filp)) { case -EBADF: @@ -1764,11 +1745,10 @@ static int nfsd4_do_async_copy(void *data) } nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file, false); - nfsd4_cleanup_inter_ssc(copy->ss_mnt, filp, copy->nf_dst); + nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst); } else { nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, false); - nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst); } do_callback: @@ -1790,8 +1770,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_notsupp; goto out; } - status = nfsd4_setup_inter_ssc(rqstp, cstate, copy, - ©->ss_mnt); + status = nfsd4_setup_inter_ssc(rqstp, cstate, copy); if (status) return nfserr_offload_denied; } else { @@ -1810,12 +1789,13 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!async_copy) goto out_err; + INIT_LIST_HEAD(&async_copy->copies); + refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; if (!nfs4_init_copy_state(nn, copy)) goto out_err; - 
refcount_set(&async_copy->refcount, 1); memcpy(©->cp_res.cb_stateid, ©->cp_stateid.cs_stid, sizeof(copy->cp_res.cb_stateid)); dup_copy_fields(copy, async_copy); @@ -1832,38 +1812,53 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { status = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, true); - nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst); } out: + release_copy_files(copy); return status; out_err: + if (nfsd4_ssc_is_inter(copy)) { + /* + * Source's vfsmount of inter-copy will be unmounted + * by the laundromat. Use copy instead of async_copy + * since async_copy->ss_nsui might not be set yet. + */ + refcount_dec(©->ss_nsui->nsui_refcnt); + } if (async_copy) cleanup_async_copy(async_copy); status = nfserrno(-ENOMEM); - /* - * source's vfsmount of inter-copy will be unmounted - * by the laundromat - */ goto out; } -struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +static struct nfsd4_copy * +find_async_copy_locked(struct nfs4_client *clp, stateid_t *stateid) { struct nfsd4_copy *copy; - spin_lock(&clp->async_lock); + lockdep_assert_held(&clp->async_lock); + list_for_each_entry(copy, &clp->async_copies, copies) { if (memcmp(©->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE)) continue; - refcount_inc(©->refcount); - spin_unlock(&clp->async_lock); return copy; } - spin_unlock(&clp->async_lock); return NULL; } +static struct nfsd4_copy * +find_async_copy(struct nfs4_client *clp, stateid_t *stateid) +{ + struct nfsd4_copy *copy; + + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, stateid); + if (copy) + refcount_inc(©->refcount); + spin_unlock(&clp->async_lock); + return copy; +} + static __be32 nfsd4_offload_cancel(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -1948,22 +1943,24 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd_file_put(nf); return status; } + static __be32 nfsd4_offload_status(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_offload_status *os = &u->offload_status; - __be32 status = 0; + __be32 status = nfs_ok; struct nfsd4_copy *copy; struct nfs4_client *clp = cstate->clp; - copy = find_async_copy(clp, &os->stateid); - if (copy) { + spin_lock(&clp->async_lock); + copy = find_async_copy_locked(clp, &os->stateid); + if (copy) os->count = copy->cp_res.wr_bytes_written; - nfs4_put_copy(copy); - } else + else status = nfserr_bad_stateid; + spin_unlock(&clp->async_lock); return status; } @@ -3619,12 +3616,13 @@ static const struct svc_procedure nfsd_procedures4[2] = { }, }; -static unsigned int nfsd_count3[ARRAY_SIZE(nfsd_procedures4)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count4[ARRAY_SIZE(nfsd_procedures4)]); const struct svc_version nfsd_version4 = { .vs_vers = 4, - .vs_nproc = 2, + .vs_nproc = ARRAY_SIZE(nfsd_procedures4), .vs_proc = nfsd_procedures4, - .vs_count = nfsd_count3, + .vs_count = nfsd_count4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, .vs_rpcb_optnl = true, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c1684da6c01f..6e61fa3acaf1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -600,23 +600,15 @@ put_nfs4_file(struct nfs4_file *fi) } static struct nfsd_file * -__nfs4_get_fd(struct nfs4_file *f, int oflag) -{ - if (f->fi_fds[oflag]) - return nfsd_file_get(f->fi_fds[oflag]); - return NULL; -} - -static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { struct nfsd_file *ret; 
lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } @@ -639,9 +631,9 @@ find_readable_file_locked(struct nfs4_file *f) lockdep_assert_held(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); return ret; } @@ -665,11 +657,11 @@ find_any_file(struct nfs4_file *f) if (!f) return NULL; spin_lock(&f->fi_lock); - ret = __nfs4_get_fd(f, O_RDWR); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); if (!ret) { - ret = __nfs4_get_fd(f, O_WRONLY); + ret = nfsd_file_get(f->fi_fds[O_WRONLY]); if (!ret) - ret = __nfs4_get_fd(f, O_RDONLY); + ret = nfsd_file_get(f->fi_fds[O_RDONLY]); } spin_unlock(&f->fi_lock); return ret; @@ -688,15 +680,6 @@ static struct nfsd_file *find_any_file_locked(struct nfs4_file *f) return NULL; } -static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f) -{ - lockdep_assert_held(&f->fi_lock); - - if (f->fi_deleg_file) - return f->fi_deleg_file; - return NULL; -} - static atomic_long_t num_delegations; unsigned long max_delegations; @@ -992,7 +975,6 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time; stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id; - stid->cs_type = cs_type; idr_preload(GFP_KERNEL); spin_lock(&nn->s2s_cp_lock); @@ -1003,6 +985,7 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid, idr_preload_end(); if (new_id < 0) return 0; + stid->cs_type = cs_type; return 1; } @@ -1036,7 +1019,8 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy) { struct nfsd_net *nn; - WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID); + if (copy->cp_stateid.cs_type != NFS4_COPY_STID) + return; nn = net_generic(copy->cp_clp->net, nfsd_net_id); spin_lock(&nn->s2s_cp_lock); idr_remove(&nn->s2s_cp_stateids, @@ -2705,7 +2689,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; spin_lock(&nf->fi_lock); - file = find_deleg_file_locked(nf); + file = nf->fi_deleg_file; if (!file) goto out; @@ -5298,16 +5282,17 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); - if (status == nfs_ok) { - if (status != nfserr_share_denied) { - set_deny(open->op_share_deny, stp); - fp->fi_share_deny |= - (open->op_share_deny & NFS4_SHARE_DENY_BOTH); - } else { - if (nfs4_resolve_deny_conflicts_locked(fp, false, - stp, open->op_share_deny, false)) - status = nfserr_jukebox; - } + switch (status) { + case nfs_ok: + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= + (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + break; + case nfserr_share_denied: + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + break; } spin_unlock(&fp->fi_lock); @@ -5438,6 +5423,23 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp, return 0; } +/* + * We avoid breaking delegations held by a client due to its own activity, but + * clearing setuid/setgid bits on a write is an implicit activity and the client + * may not notice and continue using the old mode. 
Avoid giving out a delegation + on setuid/setgid files when the client is requesting an open for write. + */ +static int +nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + + if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) && + (inode->i_mode & (S_ISUID|S_ISGID))) + return -EAGAIN; + return 0; +} + static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) @@ -5471,6 +5473,8 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; + else if (nfsd4_verify_setuid_write(open, nf)) + status = -EAGAIN; else if (!fp->fi_deleg_file) { fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being @@ -5511,6 +5515,14 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (status) goto out_unlock; + /* + * Now that the deleg is set, check again to ensure that nothing + * raced in and changed the mode while we weren't looking. + */ + status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file); + if (status) + goto out_unlock; + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (fp->fi_had_conflict) @@ -6406,23 +6418,26 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { + struct nfsd_file *ret = NULL; + if (!s) return NULL; switch (s->sc_type) { case NFS4_DELEG_STID: - if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) - return NULL; - return nfsd_file_get(s->sc_file->fi_deleg_file); + spin_lock(&s->sc_file->fi_lock); + ret = nfsd_file_get(s->sc_file->fi_deleg_file); + spin_unlock(&s->sc_file->fi_lock); + break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) - return find_readable_file(s->sc_file); + ret = find_readable_file(s->sc_file); else - return find_writeable_file(s->sc_file); + ret = find_writeable_file(s->sc_file); } - return NULL; + return ret; } static __be32 @@ -6547,8 +6562,19 @@ void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps) spin_unlock(&nn->s2s_cp_lock); } -/* - * Checks for stateid operations +/** + * nfs4_preprocess_stateid_op - find and prep stateid for an operation + * @rqstp: incoming request from client + * @cstate: current compound state + * @fhp: filehandle associated with requested stateid + * @stateid: stateid (provided by client) + * @flags: flags describing type of operation to be done + * @nfp: optional nfsd_file return pointer (may be NULL) + * @cstid: optional returned nfs4_stid pointer (may be NULL) + * + * Given info from the client, look up a nfs4_stid for the operation. On + * success, it returns a reference to the nfs4_stid and/or the nfsd_file + * associated with it. */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, @@ -6737,8 +6763,18 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ return status; } -/* - * Checks for sequence id mutating operations. +/** + * nfs4_preprocess_seqid_op - find and prep an ol_stateid for a seqid-morphing op + * @cstate: compound state + * @seqid: seqid (provided by client) + * @stateid: stateid (provided by client) + * @typemask: mask of allowable types for this operation + * @stpp: return pointer for the stateid found + * @nn: net namespace for request + * + * Given a stateid+seqid from a client, look up an nfs4_ol_stateid and + return it in @stpp.
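The mode test applied by nfsd4_verify_setuid_write() above can be tried in isolation. The following user-space sketch uses the standard S_ISUID/S_ISGID bits; the file path is only an example and not something the patch relies on.

#include <stdio.h>
#include <sys/stat.h>

/* Return nonzero if a delegation should be withheld because the open is
 * for write and the file carries setuid/setgid bits - the same condition
 * nfsd4_verify_setuid_write() checks on the delegated file's inode.
 */
static int deny_delegation(mode_t mode, int open_for_write)
{
	return open_for_write && (mode & (S_ISUID | S_ISGID));
}

int main(void)
{
	struct stat st;

	if (stat("/usr/bin/passwd", &st) == 0)	/* typically a setuid binary */
		printf("deny=%d\n", deny_delegation(st.st_mode, 1));
	return 0;
}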
On a nfs_ok return, the returned stateid will + * have its st_mutex locked. */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 3e64a3d50a1c..041faa13b852 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -488,7 +488,7 @@ found_entry: case RC_NOCACHE: break; case RC_REPLSTAT: - svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); + xdr_stream_encode_be32(&rqstp->rq_res_stream, rp->c_replstat); rtn = RC_REPLY; break; case RC_REPLBUFF: @@ -509,7 +509,7 @@ out_trace: * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply * @cachetype: which cache to update - * @statp: Reply's status code + * @statp: pointer to Reply's NFS status code, or NULL * * This is called from nfsd_dispatch when the procedure has been * executed and the complete reply is in rqstp->rq_res. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2577ee7ffb2..7b8f17ee5224 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -14,7 +14,6 @@ #include <linux/lockd/lockd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/gss_api.h> -#include <linux/sunrpc/gss_krb5_enctypes.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/module.h> #include <linux/fsnotify.h> @@ -47,7 +46,6 @@ enum { NFSD_MaxBlkSize, NFSD_MaxConnections, NFSD_Filecache, - NFSD_SupportedEnctypes, /* * The below MUST come last. Otherwise we leave a hole in nfsd_files[] * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops @@ -187,16 +185,6 @@ static int export_features_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(export_features); -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) -static int supported_enctypes_show(struct seq_file *m, void *v) -{ - seq_printf(m, KRB5_SUPPORTED_ENCTYPES); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(supported_enctypes); -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ - static const struct file_operations pool_stats_operations = { .open = nfsd_pool_stats_open, .read = seq_read, @@ -1150,6 +1138,9 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); break; + case S_IFLNK: + inode->i_op = &simple_symlink_inode_operations; + break; default: break; } @@ -1195,6 +1186,54 @@ out_err: goto out; } +#if IS_ENABLED(CONFIG_SUNRPC_GSS) +static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, + umode_t mode, const char *content) +{ + struct inode *inode; + + inode = nfsd_get_inode(dir->i_sb, mode); + if (!inode) + return -ENOMEM; + + inode->i_link = (char *)content; + inode->i_size = strlen(content); + + d_add(dentry, inode); + inc_nlink(dir); + fsnotify_create(dir, dentry); + return 0; +} + +/* + * @content is assumed to be a NUL-terminated string that lives + * longer than the symlink itself. 
+ */ +static void nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ + struct inode *dir = parent->d_inode; + struct dentry *dentry; + int ret; + + inode_lock(dir); + dentry = d_alloc_name(parent, name); + if (!dentry) + goto out; + ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content); + if (ret) + dput(dentry); +out: + inode_unlock(dir); +} +#else +static inline void nfsd_symlink(struct dentry *parent, const char *name, + const char *content) +{ +} + +#endif + static void clear_ncl(struct inode *inode) { struct nfsdfs_client *ncl = inode->i_private; @@ -1355,10 +1394,6 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO}, -#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) - [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", - &supported_enctypes_fops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, @@ -1371,6 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; + nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1458,16 +1495,11 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); - retval = nfsd_reply_cache_init(nn); - if (retval) - goto out_cache_error; get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); return 0; -out_cache_error: - nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: @@ -1476,9 +1508,6 @@ out_export_error: static __net_exit void nfsd_exit_net(struct net *net) { - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - nfsd_reply_cache_shutdown(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index fa0144a74267..d88498f8b275 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,7 +86,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, * Function prototypes. 
*/ int nfsd_svc(int nrservs, struct net *net, const struct cred *cred); -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); +int nfsd_dispatch(struct svc_rqst *rqstp); int nfsd_nrthreads(struct net *); int nfsd_nrpools(struct net *); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a82d91afdc9c..c37195572fd0 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -838,11 +838,11 @@ static const struct svc_procedure nfsd_procedures2[18] = { }, }; - -static unsigned int nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]; +static DEFINE_PER_CPU_ALIGNED(unsigned long, + nfsd_count2[ARRAY_SIZE(nfsd_procedures2)]); const struct svc_version nfsd_version2 = { .vs_vers = 2, - .vs_nproc = 18, + .vs_nproc = ARRAY_SIZE(nfsd_procedures2), .vs_proc = nfsd_procedures2, .vs_count = nfsd_count2, .vs_dispatch = nfsd_dispatch, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 325d3d3f1211..9c7b1ef5be40 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -363,7 +363,7 @@ void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn) do { read_seqbegin_or_lock(&nn->writeverf_lock, &seq); - memcpy(verf, nn->writeverf, sizeof(*verf)); + memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); } while (need_seqretry(&nn->writeverf_lock, seq)); done_seqretry(&nn->writeverf_lock, seq); } @@ -427,16 +427,23 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) ret = nfsd_file_cache_start_net(net); if (ret) goto out_lockd; - ret = nfs4_state_start_net(net); + + ret = nfsd_reply_cache_init(nn); if (ret) goto out_filecache; + ret = nfs4_state_start_net(net); + if (ret) + goto out_reply_cache; + #ifdef CONFIG_NFSD_V4_2_INTER_SSC nfsd4_ssc_init_umount_work(nn); #endif nn->nfsd_net_up = true; return 0; +out_reply_cache: + nfsd_reply_cache_shutdown(nn); out_filecache: nfsd_file_cache_shutdown_net(net); out_lockd: @@ -454,6 +461,7 @@ static void nfsd_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_state_shutdown_net(net); + nfsd_reply_cache_shutdown(nn); nfsd_file_cache_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); @@ -1022,7 +1030,6 @@ out: /** * nfsd_dispatch - Process an NFS or NFSACL Request * @rqstp: incoming request - * @statp: pointer to location of accept_stat field in RPC Reply buffer * * This RPC dispatcher integrates the NFS server's duplicate reply cache. 
* @@ -1030,9 +1037,10 @@ out: * %0: Processing complete; do not send a Reply * %1: Processing complete; send Reply in rqstp->rq_res */ -int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) +int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; + __be32 *statp = rqstp->rq_accept_statp; /* * Give the xdr decoder a chance to change this if it wants @@ -1040,7 +1048,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) */ rqstp->rq_cachetype = proc->pc_cachetype; - svcxdr_init_decode(rqstp); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -1053,12 +1060,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) goto out_dropit; } - /* - * Need to grab the location to store the status, as - * NFSv4 does some encoding while processing - */ - svcxdr_init_encode(rqstp); - *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index e94634d30591..d49d3060ed4f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -705,8 +705,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); void put_nfs4_file(struct nfs4_file *fi); -extern struct nfsd4_copy * -find_async_copy(struct nfs4_client *clp, stateid_t *staetid); extern void nfs4_put_cpntf_state(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); extern __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st, diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8f9c82d9e075..4183819ea082 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close, ) ); -TRACE_EVENT(nfsd_file_fsync, - TP_PROTO( - const struct nfsd_file *nf, - int ret - ), - TP_ARGS(nf, ret), - TP_STRUCT__entry( - __field(void *, nf_inode) - __field(int, nf_ref) - __field(int, ret) - __field(unsigned long, nf_flags) - __field(unsigned char, nf_may) - __field(struct file *, nf_file) - ), - TP_fast_assign( - __entry->nf_inode = nf->nf_inode; - __entry->nf_ref = refcount_read(&nf->nf_ref); - __entry->ret = ret; - __entry->nf_flags = nf->nf_flags; - __entry->nf_may = nf->nf_may; - __entry->nf_file = nf->nf_file; - ), - TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", - __entry->nf_inode, - __entry->nf_ref, - show_nf_flags(__entry->nf_flags), - show_nfsd_may_flags(__entry->nf_may), - __entry->nf_file, __entry->ret - ) -); - #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ab4ee3509ce3..e7462b5e5f1e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -126,9 +126,13 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct dentry *dentry = *dpp; struct path path = {.mnt = mntget(exp->ex_path.mnt), .dentry = dget(dentry)}; + unsigned int follow_flags = 0; int err = 0; - err = follow_down(&path); + if (exp->ex_flags & NFSEXP_CROSSMOUNT) + follow_flags = LOOKUP_AUTOMOUNT; + + err = follow_down(&path, follow_flags); if (err < 0) goto out; if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && @@ -223,7 +227,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (d_mountpoint(dentry)) + if (d_managed(dentry)) /* * Might only be a mountpoint in a different namespace, * but we need to check. 
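Looking back at the nfsd_copy_write_verifier() hunk in fs/nfsd/nfssvc.c above, the reason sizeof(*verf) had to become sizeof(nn->writeverf) is ordinary array-parameter decay: although the parameter is written as __be32 verf[2], inside the function verf is a pointer, so sizeof(*verf) is 4 and only half of the 8-byte write verifier was copied. A self-contained illustration with invented types and values:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct nn_demo { uint32_t writeverf[2]; };

static void copy_verifier(uint32_t verf[2], const struct nn_demo *nn)
{
	/* verf decays to a pointer, so sizeof(*verf) is 4, not 8 */
	printf("sizeof(*verf)=%zu sizeof(nn->writeverf)=%zu\n",
	       sizeof(*verf), sizeof(nn->writeverf));
	memcpy(verf, nn->writeverf, sizeof(nn->writeverf)); /* copies all 8 bytes */
}

int main(void)
{
	struct nn_demo nn = { .writeverf = { 0x11111111u, 0x22222222u } };
	uint32_t v[2] = { 0, 0 };

	copy_verifier(v, &nn);
	printf("%08x %08x\n", v[0], v[1]);
	return 0;
}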
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4fd2cf6d1d2d..510978e602da 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -571,7 +571,7 @@ struct nfsd4_copy { struct task_struct *copy_task; refcount_t refcount; - struct vfsmount *ss_mnt; + struct nfsd4_ssc_umount_item *ss_nsui; struct nfs_fh c_fh; nfs4_stateid stateid; }; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a07b24d170f2..aecbd712a00c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -46,6 +46,7 @@ #include <linux/net.h> #include <linux/export.h> #include <net/tcp.h> +#include <trace/events/sock.h> #include <linux/uaccess.h> @@ -585,6 +586,8 @@ static void o2net_data_ready(struct sock *sk) void (*ready)(struct sock *sk); struct o2net_sock_container *sc; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc) { @@ -1931,6 +1934,8 @@ static void o2net_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7d605db3bb3b..ace133cf6072 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -167,7 +167,7 @@ struct smb2_err_rsp { __u8 ErrorContextCount; __u8 Reserved; __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ + __u8 ErrorData[]; /* variable length */ } __packed; #define SMB3_AES_CCM_NONCE 11 @@ -308,7 +308,7 @@ struct smb2_tree_connect_req { __le16 Flags; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; - __u8 Buffer[1]; /* variable length */ + __u8 Buffer[]; /* variable length */ } __packed; /* Possible ShareType values */ @@ -595,7 +595,7 @@ struct smb2_negotiate_rsp { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -616,7 +616,7 @@ struct smb2_sess_setup_req { __le16 SecurityBufferOffset; __le16 SecurityBufferLength; __le64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; /* Currently defined SessionFlags */ @@ -633,7 +633,7 @@ struct smb2_sess_setup_rsp { __le16 SessionFlags; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ + __u8 Buffer[]; /* variable length GSS security buffer */ } __packed; @@ -715,7 +715,7 @@ struct smb2_read_req { __le32 RemainingBytes; __le16 ReadChannelInfoOffset; __le16 ReadChannelInfoLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* Read flags */ @@ -730,7 +730,7 @@ struct smb2_read_rsp { __le32 DataLength; __le32 DataRemaining; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -754,7 +754,7 @@ struct smb2_write_req { __le16 WriteChannelInfoOffset; __le16 WriteChannelInfoLength; __le32 Flags; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_write_rsp { @@ -765,7 +765,7 @@ struct smb2_write_rsp { __le32 DataLength; __le32 DataRemaining; __u32 Reserved2; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; @@ -812,7 +812,10 @@ struct smb2_lock_req { __u64 PersistentFileId; __u64 VolatileFileId; /* Followed by at least one */ - struct smb2_lock_element locks[1]; + union { + struct smb2_lock_element 
lock; + DECLARE_FLEX_ARRAY(struct smb2_lock_element, locks); + }; } __packed; struct smb2_lock_rsp { @@ -866,7 +869,7 @@ struct smb2_query_directory_req { __le16 FileNameOffset; __le16 FileNameLength; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_directory_rsp { @@ -874,7 +877,7 @@ struct smb2_query_directory_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -897,7 +900,7 @@ struct smb2_set_info_req { __le32 AdditionalInformation; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_set_info_rsp { @@ -952,7 +955,7 @@ struct smb2_change_notify_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ + __u8 Buffer[]; /* array of file notify structs */ } __packed; @@ -1158,7 +1161,7 @@ struct smb2_create_rsp { __u64 VolatileFileId; __le32 CreateContextsOffset; __le32 CreateContextsLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct create_posix { @@ -1501,7 +1504,7 @@ struct smb2_query_info_req { __le32 Flags; __u64 PersistentFileId; __u64 VolatileFileId; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; struct smb2_query_info_rsp { @@ -1509,7 +1512,7 @@ struct smb2_query_info_rsp { __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; - __u8 Buffer[1]; + __u8 Buffer[]; } __packed; /* @@ -1570,7 +1573,10 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ __le32 Mode; __le32 AlignmentRequirement; __le32 FileNameLength; - char FileName[1]; + union { + char __pad; /* Legacy structure padding */ + DECLARE_FLEX_ARRAY(char, FileName); + }; } __packed; /* level 18 Query */ struct smb2_file_eof_info { /* encoding of request for level 10 */ diff --git a/fs/splice.c b/fs/splice.c index 87d9b19349de..2e76dbb81a8f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -282,6 +282,99 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } +/* + * Splice data from an O_DIRECT file into pages and then add them to the output + * pipe. 
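The fs/smbfs_common/smb2pdu.h hunks above replace one-element trailing arrays with C99 flexible array members, and where a legacy single-element or padding view must stay addressable (smb2_lock_req, smb2_file_all_info) they wrap the flexible array in a union via DECLARE_FLEX_ARRAY so sizeof() of the structure does not change. A minimal user-space sketch of the basic [1] to [] conversion follows; the structure and field names are invented for the demo.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rsp_old {		/* old style: one-element trailing array */
	unsigned short data_len;
	unsigned char  buffer[1];	/* counted in sizeof(struct rsp_old) */
};

struct rsp_new {		/* new style: flexible array member */
	unsigned short data_len;
	unsigned char  buffer[];	/* not counted in sizeof(struct rsp_new) */
};

int main(void)
{
	static const char payload[] = "hello";
	struct rsp_new *r;

	printf("sizeof(old)=%zu sizeof(new)=%zu\n",
	       sizeof(struct rsp_old), sizeof(struct rsp_new));

	/* allocate exactly header + payload, no hidden extra byte */
	r = malloc(sizeof(*r) + sizeof(payload));
	if (!r)
		return 1;
	r->data_len = sizeof(payload);
	memcpy(r->buffer, payload, sizeof(payload));
	printf("len=%u first=%c\n", r->data_len, r->buffer[0]);
	free(r);
	return 0;
}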
+ */ +ssize_t direct_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct iov_iter to; + struct bio_vec *bv; + struct kiocb kiocb; + struct page **pages; + ssize_t ret; + size_t used, npages, chunk, remain, reclaim; + int i; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_occupancy(pipe->head, pipe->tail); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + npages = DIV_ROUND_UP(len, PAGE_SIZE); + + bv = kzalloc(array_size(npages, sizeof(bv[0])) + + array_size(npages, sizeof(struct page *)), GFP_KERNEL); + if (!bv) + return -ENOMEM; + + pages = (void *)(bv + npages); + npages = alloc_pages_bulk_array(GFP_USER, npages, pages); + if (!npages) { + kfree(bv); + return -ENOMEM; + } + + remain = len = min_t(size_t, len, npages * PAGE_SIZE); + + for (i = 0; i < npages; i++) { + chunk = min_t(size_t, PAGE_SIZE, remain); + bv[i].bv_page = pages[i]; + bv[i].bv_offset = 0; + bv[i].bv_len = chunk; + remain -= chunk; + } + + /* Do the I/O */ + iov_iter_bvec(&to, ITER_DEST, bv, npages, len); + init_sync_kiocb(&kiocb, in); + kiocb.ki_pos = *ppos; + ret = call_read_iter(in, &kiocb, &to); + + reclaim = npages * PAGE_SIZE; + remain = 0; + if (ret > 0) { + reclaim -= ret; + remain = ret; + *ppos = kiocb.ki_pos; + file_accessed(in); + } else if (ret < 0) { + /* + * callers of ->splice_read() expect -EAGAIN on + * "can't put anything in there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; + } + + /* Free any pages that didn't get touched at all. */ + reclaim /= PAGE_SIZE; + if (reclaim) { + npages -= reclaim; + release_pages(pages + npages, reclaim); + } + + /* Push the remaining pages into the pipe. 
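The sizing logic at the top of direct_splice_read() above first clamps the request to the free pipe slots and then carves the bulk-allocated pages into per-page chunks for the bio_vec array. A user-space sketch of that arithmetic, with PAGE_SIZE and the example numbers assumed for the demo:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t len = 10000;		/* bytes the caller asked to splice */
	size_t free_slots = 2;		/* pipe->max_usage - pipe_occupancy() */
	size_t npages, remain, chunk, i;

	/* clamp to what the pipe can take, then round up to whole pages */
	if (len > free_slots * PAGE_SIZE)
		len = free_slots * PAGE_SIZE;
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/* carve the length into per-page chunks, as done for the bio_vec array */
	remain = len;
	for (i = 0; i < npages; i++) {
		chunk = remain < PAGE_SIZE ? remain : PAGE_SIZE;
		printf("page %zu: %zu bytes\n", i, chunk);
		remain -= chunk;
	}
	return 0;
}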
*/ + for (i = 0; i < npages; i++) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + chunk = min_t(size_t, remain, PAGE_SIZE); + *buf = (struct pipe_buffer) { + .ops = &default_pipe_buf_ops, + .page = bv[i].bv_page, + .offset = 0, + .len = chunk, + }; + pipe->head++; + remain -= chunk; + } + + kfree(bv); + return ret; +} +EXPORT_SYMBOL(direct_splice_read); + /** * generic_file_splice_read - splice data from file to a pipe * @in: file to splice from diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 989cf341779b..f8ff81c3de76 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2472,20 +2472,20 @@ xfs_defer_agfl_block( struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); - new->xefi_blockcount = 1; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_blockcount = 1; + xefi->xefi_owner = oinfo->oi_owner; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } /* @@ -2500,7 +2500,7 @@ __xfs_free_extent_later( const struct xfs_owner_info *oinfo, bool skip_discard) { - struct xfs_extent_free_item *new; /* new element */ + struct xfs_extent_free_item *xefi; #ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; xfs_agnumber_t agno; @@ -2519,27 +2519,27 @@ __xfs_free_extent_later( #endif ASSERT(xfs_extfree_item_cache != NULL); - new = kmem_cache_zalloc(xfs_extfree_item_cache, + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = bno; - new->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_startblock = bno; + xefi->xefi_blockcount = (xfs_extlen_t)len; if (skip_discard) - new->xefi_flags |= XFS_EFI_SKIP_DISCARD; + xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { ASSERT(oinfo->oi_offset == 0); if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) - new->xefi_flags |= XFS_EFI_ATTR_FORK; + xefi->xefi_flags |= XFS_EFI_ATTR_FORK; if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) - new->xefi_flags |= XFS_EFI_BMBT_BLOCK; - new->xefi_owner = oinfo->oi_owner; + xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; + xefi->xefi_owner = oinfo->oi_owner; } else { - new->xefi_owner = XFS_RMAP_OWN_NULL; + xefi->xefi_owner = XFS_RMAP_OWN_NULL; } trace_xfs_bmap_free_defer(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } #ifdef DEBUG diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0d56a8d862e8..c8c65387136c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6146,39 +6146,37 @@ xfs_bmap_unmap_extent( int xfs_bmap_finish_one( struct xfs_trans *tp, - struct xfs_inode *ip, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { + struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; ASSERT(tp->t_firstblock == NULLFSBLOCK); 
trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, - XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), - ip->i_ino, whichfork, startoff, *blockcount, state); + XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), + bi->bi_type, + XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), + bi->bi_owner->i_ino, bi->bi_whichfork, + bmap->br_startoff, bmap->br_blockcount, + bmap->br_state); - if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) + if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; - switch (type) { + switch (bi->bi_type) { case XFS_BMAP_MAP: - error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, 0); - *blockcount = 0; + error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, + bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: - error = __xfs_bunmapi(tp, ip, startoff, blockcount, - XFS_BMAPI_REMAP, 1); + error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, + &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 16db95b11589..01c2df35c3e3 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -234,10 +234,7 @@ struct xfs_bmap_intent { struct xfs_bmbt_irec bi_bmap; }; -int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, - enum xfs_bmap_intent_type type, int whichfork, - xfs_fileoff_t startoff, xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, xfs_exntst_t state); +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 35f574421670..da8c769887fd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2913,9 +2913,22 @@ xfs_btree_split_worker( } /* - * BMBT split requests often come in with little stack to work on. Push + * BMBT split requests often come in with little stack to work on so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. + * + * Care must be taken here - the work queue rescuer thread introduces potential + * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new + * AGFs to allocate blocks. A task being run by the rescuer could attempt to + * lock an AGF that is already locked by a task queued to run by the rescuer, + * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to + * release it until the current thread it is running gains the lock. + * + * To avoid this issue, we only ever queue BMBT splits that don't have an AGF + * already locked to allocate from. The only place that doesn't hold an AGF + * locked is unwritten extent conversion at IO completion, but that has already + * been offloaded to a worker thread and hence has no stack consumption issues + * we have to worry about. 
*/ STATIC int /* error */ xfs_btree_split( @@ -2929,7 +2942,8 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP) + if (cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_tp->t_firstblock == NULLFSBLOCK) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6f7ed9288fe4..bcf46aa0d08b 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1213,37 +1213,33 @@ out_error: STATIC int xfs_refcount_adjust( struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - xfs_extlen_t aglen, - xfs_agblock_t *new_agbno, - xfs_extlen_t *new_aglen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, enum xfs_refc_adjust_op adj) { bool shape_changed; int shape_changes = 0; int error; - *new_agbno = agbno; - *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_increase(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno, aglen); + trace_xfs_refcount_decrease(cur->bc_mp, + cur->bc_ag.pag->pag_agno, *agbno, *aglen); /* * Ensure that no rcextents cross the boundary of the adjustment range. */ error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno, &shape_changed); + *agbno, &shape_changed); if (error) goto out_error; if (shape_changed) shape_changes++; error = xfs_refcount_split_extent(cur, XFS_REFC_DOMAIN_SHARED, - agbno + aglen, &shape_changed); + *agbno + *aglen, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1253,7 +1249,7 @@ xfs_refcount_adjust( * Try to merge with the left or right extents of the range. 
*/ error = xfs_refcount_merge_extents(cur, XFS_REFC_DOMAIN_SHARED, - new_agbno, new_aglen, adj, &shape_changed); + agbno, aglen, adj, &shape_changed); if (error) goto out_error; if (shape_changed) @@ -1262,7 +1258,7 @@ xfs_refcount_adjust( cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); + error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); if (error) goto out_error; @@ -1298,21 +1294,20 @@ xfs_refcount_finish_one_cleanup( static inline int xfs_refcount_continue_op( struct xfs_btree_cur *cur, - xfs_fsblock_t startblock, - xfs_agblock_t new_agbno, - xfs_extlen_t new_len, - xfs_fsblock_t *new_fsbno) + struct xfs_refcount_intent *ri, + xfs_agblock_t new_agbno) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, new_len))) + if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, + ri->ri_blockcount))) return -EFSCORRUPTED; - *new_fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); + ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); - ASSERT(xfs_verify_fsbext(mp, *new_fsbno, new_len)); - ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, *new_fsbno)); + ASSERT(xfs_verify_fsbext(mp, ri->ri_startblock, ri->ri_blockcount)); + ASSERT(pag->pag_agno == XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); return 0; } @@ -1327,11 +1322,7 @@ xfs_refcount_continue_op( int xfs_refcount_finish_one( struct xfs_trans *tp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -1339,17 +1330,16 @@ xfs_refcount_finish_one( struct xfs_buf *agbp = NULL; int error = 0; xfs_agblock_t bno; - xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); - trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), - type, XFS_FSB_TO_AGBNO(mp, startblock), - blockcount); + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), + ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), + ri->ri_blockcount); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { error = -EIO; @@ -1380,42 +1370,42 @@ xfs_refcount_finish_one( } *pcur = rcur; - switch (type) { + switch (ri->ri_type) { case XFS_REFCOUNT_INCREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_INCREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_INCREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_DECREASE: - error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_DECREASE); + error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, + XFS_REFCOUNT_ADJUST_DECREASE); if (error) goto out_drop; - if (*new_len > 0) - error = xfs_refcount_continue_op(rcur, startblock, - new_agbno, *new_len, new_fsb); + if (ri->ri_blockcount > 0) + error = 
xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_alloc(rcur, bno, blockcount); + error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: - *new_fsb = startblock + blockcount; - *new_len = 0; - error = __xfs_refcount_cow_free(rcur, bno, blockcount); + error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); + if (error) + goto out_drop; + ri->ri_blockcount = 0; break; default: ASSERT(0); error = -EFSCORRUPTED; } - if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, - bno, blockcount, new_agbno, *new_len); + if (!error && ri->ri_blockcount > 0) + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + ri->ri_type, bno, ri->ri_blockcount); out_drop: xfs_perag_put(pag); return error; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 452f30556f5a..c633477ce3ce 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -75,9 +75,7 @@ void xfs_refcount_decrease_extent(struct xfs_trans *tp, extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); extern int xfs_refcount_finish_one(struct xfs_trans *tp, - enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, - xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur); extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index b56aca1e7c66..df720041cd3d 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2390,13 +2390,7 @@ xfs_rmap_finish_one_cleanup( int xfs_rmap_finish_one( struct xfs_trans *tp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; @@ -2408,11 +2402,13 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); - bno = XFS_FSB_TO_AGBNO(mp, startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); + bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, - startoff, blockcount, state); + trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, + ri->ri_bmap.br_state); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { error = -EIO; @@ -2448,35 +2444,37 @@ xfs_rmap_finish_one( } *pcur = rcur; - xfs_rmap_ino_owner(&oinfo, owner, whichfork, startoff); - unwritten = state == XFS_EXT_UNWRITTEN; - bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, startblock); + xfs_rmap_ino_owner(&oinfo, ri->ri_owner, ri->ri_whichfork, + ri->ri_bmap.br_startoff); + unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; + bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (type) { + switch (ri->ri_type) { case XFS_RMAP_ALLOC: case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); + error = xfs_rmap_map(rcur, bno, 
ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_map_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, + unwritten, &oinfo); break; case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, - &oinfo); + error = xfs_rmap_unmap_shared(rcur, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); break; case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, - &oinfo); + error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, + !unwritten, &oinfo); break; case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, blockcount, - !unwritten, &oinfo); + error = xfs_rmap_convert_shared(rcur, bno, + ri->ri_bmap.br_blockcount, !unwritten, &oinfo); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 54741a591a17..2dac88cea28d 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -179,10 +179,8 @@ void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno, void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); -int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, - uint64_t owner, int whichfork, xfs_fileoff_t startoff, - xfs_fsblock_t startblock, xfs_filblks_t blockcount, - xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, + struct xfs_btree_cur **pcur); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 41323da523d1..6e2f0013380a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -246,18 +246,11 @@ static int xfs_trans_log_finish_bmap_update( struct xfs_trans *tp, struct xfs_bud_log_item *budp, - enum xfs_bmap_intent_type type, - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) + struct xfs_bmap_intent *bi) { int error; - error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, - startblock, blockcount, state); + error = xfs_bmap_finish_one(tp, bi); /* * Mark the transaction dirty, even on error. This ensures the @@ -290,24 +283,24 @@ xfs_bmap_update_diff_items( /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( - struct xfs_map_extent *bmap, + struct xfs_map_extent *map, enum xfs_bmap_intent_type type, int whichfork, xfs_exntst_t state) { - bmap->me_flags = 0; + map->me_flags = 0; switch (type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - bmap->me_flags = type; + map->me_flags = type; break; default: ASSERT(0); } if (state == XFS_EXT_UNWRITTEN) - bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; } /* Log bmap updates in the intent item. 
*/ @@ -315,7 +308,7 @@ STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, struct xfs_bui_log_item *buip, - struct xfs_bmap_intent *bmap) + struct xfs_bmap_intent *bi) { uint next_extent; struct xfs_map_extent *map; @@ -331,12 +324,12 @@ xfs_bmap_update_log_item( next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; ASSERT(next_extent < buip->bui_format.bui_nextents); map = &buip->bui_format.bui_extents[next_extent]; - map->me_owner = bmap->bi_owner->i_ino; - map->me_startblock = bmap->bi_bmap.br_startblock; - map->me_startoff = bmap->bi_bmap.br_startoff; - map->me_len = bmap->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, - bmap->bi_bmap.br_state); + map->me_owner = bi->bi_owner->i_ino; + map->me_startblock = bi->bi_bmap.br_startblock; + map->me_startoff = bi->bi_bmap.br_startoff; + map->me_len = bi->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, + bi->bi_bmap.br_state); } static struct xfs_log_item * @@ -348,15 +341,15 @@ xfs_bmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_bui_log_item *buip = xfs_bui_init(mp); - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); - list_for_each_entry(bmap, items, bi_list) - xfs_bmap_update_log_item(tp, buip, bmap); + list_for_each_entry(bi, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bi); return &buip->bui_item; } @@ -378,25 +371,17 @@ xfs_bmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_bmap_intent *bmap; - xfs_filblks_t count; + struct xfs_bmap_intent *bi; int error; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), - bmap->bi_type, - bmap->bi_owner, bmap->bi_whichfork, - bmap->bi_bmap.br_startoff, - bmap->bi_bmap.br_startblock, - &count, - bmap->bi_bmap.br_state); - if (!error && count > 0) { - ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); - bmap->bi_bmap.br_blockcount = count; + bi = container_of(item, struct xfs_bmap_intent, bi_list); + + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + if (!error && bi->bi_bmap.br_blockcount > 0) { + ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } - kmem_cache_free(xfs_bmap_intent_cache, bmap); + kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -413,10 +398,10 @@ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) { - struct xfs_bmap_intent *bmap; + struct xfs_bmap_intent *bi; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - kmem_cache_free(xfs_bmap_intent_cache, bmap); + bi = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_cache_free(xfs_bmap_intent_cache, bi); } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { @@ -434,18 +419,18 @@ xfs_bui_validate( struct xfs_mount *mp, struct xfs_bui_log_item *buip) { - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; /* Only one mapping operation per BUI... 
*/ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return false; - bmap = &buip->bui_format.bui_extents[0]; + map = &buip->bui_format.bui_extents[0]; - if (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_BMAP_EXTENT_FLAGS) return false; - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: break; @@ -453,13 +438,13 @@ xfs_bui_validate( return false; } - if (!xfs_verify_ino(mp, bmap->me_owner)) + if (!xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, bmap->me_startoff, bmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, bmap->me_startblock, bmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -471,17 +456,13 @@ xfs_bui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; + struct xfs_bmap_intent fake = { }; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_map_extent *bmap; + struct xfs_map_extent *map; struct xfs_bud_log_item *budp; - xfs_filblks_t count; - xfs_exntst_t state; - unsigned int bui_type; - int whichfork; int iext_delta; int error = 0; @@ -491,14 +472,12 @@ xfs_bui_item_recover( return -EFSCORRUPTED; } - bmap = &buip->bui_format.bui_extents[0]; - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + map = &buip->bui_format.bui_extents[0]; + fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - error = xlog_recover_iget(mp, bmap->me_owner, &ip); + error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) return error; @@ -512,34 +491,34 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (bui_type == XFS_BMAP_MAP) + if (fake.bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, - whichfork, bmap->me_startoff, bmap->me_startblock, - &count, state); + fake.bi_owner = ip; + fake.bi_bmap.br_startblock = map->me_startblock; + fake.bi_bmap.br_startoff = map->me_startoff; + fake.bi_bmap.br_blockcount = map->me_len; + fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, - sizeof(*bmap)); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, + sizeof(*map)); if (error) goto err_cancel; - if (count > 0) { - ASSERT(bui_type == XFS_BMAP_UNMAP); - irec.br_startblock = bmap->me_startblock; - irec.br_blockcount = count; - irec.br_startoff = bmap->me_startoff; - irec.br_state = state; - xfs_bmap_unmap_extent(tp, ip, &irec); + if (fake.bi_bmap.br_blockcount > 0) { + ASSERT(fake.bi_type == XFS_BMAP_UNMAP); + xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); } /* @@ -579,18 +558,18 @@ xfs_bui_item_relog( { struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = BUI_ITEM(intent)->bui_format.bui_nextents; - extp = BUI_ITEM(intent)->bui_format.bui_extents; + map = BUI_ITEM(intent)->bui_format.bui_extents; tp->t_flags |= XFS_TRANS_DIRTY; budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); buip = xfs_bui_init(tp->t_mountp); - memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); xfs_trans_add_item(tp, &buip->bui_item); set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ae082808cfed..b2cbbba3e15a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -228,7 +228,7 @@ static struct attribute *xfs_errortag_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_errortag); -static struct kobj_type xfs_errortag_ktype = { +static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_errortag_sysfs_ops, .default_groups = xfs_errortag_groups, diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index dbe6c37dc697..0b9c5ba8a598 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -75,7 +75,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); /* * XFS panic tags -- allow a call to xfs_alert_tag() be turned into - * a panic by setting xfs_panic_mask in a sysctl. + * a panic by setting fs.xfs.panic_mask in a sysctl. 
*/ #define XFS_NO_PTAG 0u #define XFS_PTAG_IFLUSH (1u << 0) @@ -88,6 +88,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO (1u << 7) #define XFS_PTAG_VERIFIER_ERROR (1u << 8) +#define XFS_PTAG_MASK (XFS_PTAG_IFLUSH | \ + XFS_PTAG_LOGRES | \ + XFS_PTAG_AILDELETE | \ + XFS_PTAG_ERROR_REPORT | \ + XFS_PTAG_SHUTDOWN_CORRUPT | \ + XFS_PTAG_SHUTDOWN_IOERROR | \ + XFS_PTAG_SHUTDOWN_LOGERROR | \ + XFS_PTAG_FSBLOCK_ZERO | \ + XFS_PTAG_VERIFIER_ERROR) + #define XFS_PTAG_STRINGS \ { XFS_NO_PTAG, "none" }, \ { XFS_PTAG_IFLUSH, "iflush" }, \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d5130d1fcfae..011b50469301 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -345,23 +345,30 @@ static int xfs_trans_free_extent( struct xfs_trans *tp, struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - const struct xfs_owner_info *oinfo, - bool skip_discard) + struct xfs_extent_free_item *xefi) { + struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, + xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - start_block); + xefi->xefi_startblock); int error; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, + xefi->xefi_blockcount); - error = __xfs_free_extent(tp, start_block, ext_len, - oinfo, XFS_AG_RESV_NONE, skip_discard); + error = __xfs_free_extent(tp, xefi->xefi_startblock, + xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* * Mark the transaction dirty, even on error. 
This ensures the * transaction is aborted, which: @@ -375,8 +382,8 @@ xfs_trans_free_extent( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; return error; @@ -404,7 +411,7 @@ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, struct xfs_efi_log_item *efip, - struct xfs_extent_free_item *free) + struct xfs_extent_free_item *xefi) { uint next_extent; struct xfs_extent *extp; @@ -420,8 +427,8 @@ xfs_extent_free_log_item( next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &efip->efi_format.efi_extents[next_extent]; - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; } static struct xfs_log_item * @@ -433,15 +440,15 @@ xfs_extent_free_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; ASSERT(count > 0); xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); - list_for_each_entry(free, items, xefi_list) - xfs_extent_free_log_item(tp, efip, free); + list_for_each_entry(xefi, items, xefi_list) + xfs_extent_free_log_item(tp, efip, xefi); return &efip->efi_item; } @@ -463,21 +470,13 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_owner_info oinfo = { }; - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; int error; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - oinfo.oi_owner = free->xefi_owner; - if (free->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - error = xfs_trans_free_extent(tp, EFD_ITEM(done), - free->xefi_startblock, - free->xefi_blockcount, - &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -494,10 +493,10 @@ STATIC void xfs_extent_free_cancel_item( struct list_head *item) { - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_cache_free(xfs_extfree_item_cache, free); + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_cache_free(xfs_extfree_item_cache, xefi); } const struct xfs_defer_op_type xfs_extent_free_defer_type = { @@ -523,7 +522,7 @@ xfs_agfl_free_finish_item( struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); - struct xfs_extent_free_item *free; + struct xfs_extent_free_item *xefi; struct xfs_extent *extp; struct xfs_buf *agbp; int error; @@ -532,13 +531,13 @@ xfs_agfl_free_finish_item( uint next_extent; struct xfs_perag *pag; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - ASSERT(free->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - agbno = 
XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); - oinfo.oi_owner = free->xefi_owner; + xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(xefi->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); + oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); @@ -559,11 +558,11 @@ xfs_agfl_free_finish_item( next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; - kmem_cache_free(xfs_extfree_item_cache, free); + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -599,7 +598,6 @@ xfs_efi_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; - struct xfs_extent *extp; int i; int error = 0; @@ -624,10 +622,17 @@ xfs_efi_item_recover( efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { + struct xfs_extent_free_item fake = { + .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + }; + struct xfs_extent *extp; + extp = &efip->efi_format.efi_extents[i]; - error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, - &XFS_RMAP_OINFO_ANY_OWNER, false); + + fake.xefi_startblock = extp->ext_start; + fake.xefi_blockcount = extp->ext_len; + + error = xfs_trans_free_extent(tp, efdp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d0a98f920ca..9edc1f2bc939 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -4,6 +4,7 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_error.h" /* * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, @@ -15,7 +16,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 256 }, + .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fc1946f80a4a..69dbe7814128 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -83,7 +83,7 @@ xfs_iomap_valid( return true; } -static const struct iomap_page_ops xfs_iomap_page_ops = { +static const struct iomap_folio_ops xfs_iomap_folio_ops = { .iomap_valid = xfs_iomap_valid, }; @@ -133,7 +133,7 @@ xfs_bmbt_to_iomap( iomap->flags |= IOMAP_F_DIRTY; iomap->validity_cookie = sequence_cookie; - iomap->page_ops = &xfs_iomap_page_ops; + iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 858e3e9eb4a8..48d771a76add 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -252,17 +252,12 @@ static int xfs_trans_log_finish_refcount_update( struct xfs_trans *tp, struct xfs_cud_log_item *cudp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, + struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_refcount_finish_one(tp, type, startblock, - blockcount, new_fsb, new_len, pcur); + error = xfs_refcount_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. This ensures the @@ -297,16 +292,16 @@ xfs_refcount_update_diff_items( /* Set the phys extent flags for this reverse mapping. */ static void xfs_trans_set_refcount_flags( - struct xfs_phys_extent *refc, + struct xfs_phys_extent *pmap, enum xfs_refcount_intent_type type) { - refc->pe_flags = 0; + pmap->pe_flags = 0; switch (type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - refc->pe_flags |= type; + pmap->pe_flags |= type; break; default: ASSERT(0); @@ -318,10 +313,10 @@ STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, struct xfs_cui_log_item *cuip, - struct xfs_refcount_intent *refc) + struct xfs_refcount_intent *ri) { uint next_extent; - struct xfs_phys_extent *ext; + struct xfs_phys_extent *pmap; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -333,10 +328,10 @@ xfs_refcount_update_log_item( */ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; ASSERT(next_extent < cuip->cui_format.cui_nextents); - ext = &cuip->cui_format.cui_extents[next_extent]; - ext->pe_startblock = refc->ri_startblock; - ext->pe_len = refc->ri_blockcount; - xfs_trans_set_refcount_flags(ext, refc->ri_type); + pmap = &cuip->cui_format.cui_extents[next_extent]; + pmap->pe_startblock = ri->ri_startblock; + pmap->pe_len = ri->ri_blockcount; + xfs_trans_set_refcount_flags(pmap, ri->ri_type); } static struct xfs_log_item * @@ -348,15 +343,15 @@ xfs_refcount_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); - list_for_each_entry(refc, items, ri_list) - xfs_refcount_update_log_item(tp, cuip, refc); + list_for_each_entry(ri, items, ri_list) + 
xfs_refcount_update_log_item(tp, cuip, ri); return &cuip->cui_item; } @@ -378,25 +373,20 @@ xfs_refcount_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_refcount_intent *refc; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_aglen; + struct xfs_refcount_intent *ri; int error; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), - refc->ri_type, refc->ri_startblock, refc->ri_blockcount, - &new_fsb, &new_aglen, state); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, + state); /* Did we run out of reservation? Requeue what we didn't finish. */ - if (!error && new_aglen > 0) { - ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || - refc->ri_type == XFS_REFCOUNT_DECREASE); - refc->ri_startblock = new_fsb; - refc->ri_blockcount = new_aglen; + if (!error && ri->ri_blockcount > 0) { + ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || + ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } - kmem_cache_free(xfs_refcount_intent_cache, refc); + kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -413,10 +403,10 @@ STATIC void xfs_refcount_update_cancel_item( struct list_head *item) { - struct xfs_refcount_intent *refc; + struct xfs_refcount_intent *ri; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - kmem_cache_free(xfs_refcount_intent_cache, refc); + ri = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_cache_free(xfs_refcount_intent_cache, ri); } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { @@ -433,15 +423,15 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { static inline bool xfs_cui_validate_phys( struct xfs_mount *mp, - struct xfs_phys_extent *refc) + struct xfs_phys_extent *pmap) { if (!xfs_has_reflink(mp)) return false; - if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) + if (pmap->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) return false; - switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + switch (pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: @@ -451,7 +441,7 @@ xfs_cui_validate_phys( return false; } - return xfs_verify_fsbext(mp, refc->pe_startblock, refc->pe_len); + return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } /* @@ -463,18 +453,13 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { - struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_phys_extent *refc; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_len; unsigned int refc_type; bool requeue_only = false; - enum xfs_refcount_intent_type type; int i; int error = 0; @@ -513,14 +498,17 @@ xfs_cui_item_recover( cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - refc = &cuip->cui_format.cui_extents[i]; - refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + struct xfs_refcount_intent fake = { }; + struct xfs_phys_extent *pmap; + + pmap = &cuip->cui_format.cui_extents[i]; + refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; switch (refc_type) { case XFS_REFCOUNT_INCREASE: case XFS_REFCOUNT_DECREASE: case XFS_REFCOUNT_ALLOC_COW: case XFS_REFCOUNT_FREE_COW: - type = refc_type; + fake.ri_type = refc_type; 
break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -529,13 +517,12 @@ xfs_cui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - if (requeue_only) { - new_fsb = refc->pe_startblock; - new_len = refc->pe_len; - } else + + fake.ri_startblock = pmap->pe_startblock; + fake.ri_blockcount = pmap->pe_len; + if (!requeue_only) error = xfs_trans_log_finish_refcount_update(tp, cudp, - type, refc->pe_startblock, refc->pe_len, - &new_fsb, &new_len, &rcur); + &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, @@ -544,10 +531,13 @@ xfs_cui_item_recover( goto abort_error; /* Requeue what we didn't finish. */ - if (new_len > 0) { - irec.br_startblock = new_fsb; - irec.br_blockcount = new_len; - switch (type) { + if (fake.ri_blockcount > 0) { + struct xfs_bmbt_irec irec = { + .br_startblock = fake.ri_startblock, + .br_blockcount = fake.ri_blockcount, + }; + + switch (fake.ri_type) { case XFS_REFCOUNT_INCREASE: xfs_refcount_increase_extent(tp, &irec); break; @@ -596,18 +586,18 @@ xfs_cui_item_relog( { struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; - struct xfs_phys_extent *extp; + struct xfs_phys_extent *pmap; unsigned int count; count = CUI_ITEM(intent)->cui_format.cui_nextents; - extp = CUI_ITEM(intent)->cui_format.cui_extents; + pmap = CUI_ITEM(intent)->cui_format.cui_extents; tp->t_flags |= XFS_TRANS_DIRTY; cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); cuip = xfs_cui_init(tp->t_mountp, count); - memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); xfs_trans_add_item(tp, &cuip->cui_item); set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 534504ede1a3..a1619d67015f 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -244,40 +244,40 @@ xfs_trans_get_rud( /* Set the map extent flags for this reverse mapping. 
*/ static void xfs_trans_set_rmap_flags( - struct xfs_map_extent *rmap, + struct xfs_map_extent *map, enum xfs_rmap_intent_type type, int whichfork, xfs_exntst_t state) { - rmap->me_flags = 0; + map->me_flags = 0; if (state == XFS_EXT_UNWRITTEN) - rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + map->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; if (whichfork == XFS_ATTR_FORK) - rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + map->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; switch (type) { case XFS_RMAP_MAP: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + map->me_flags |= XFS_RMAP_EXTENT_MAP; break; case XFS_RMAP_MAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; break; case XFS_RMAP_UNMAP: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; case XFS_RMAP_UNMAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; break; case XFS_RMAP_CONVERT: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; case XFS_RMAP_CONVERT_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + map->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; break; case XFS_RMAP_ALLOC: - rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + map->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; case XFS_RMAP_FREE: - rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + map->me_flags |= XFS_RMAP_EXTENT_FREE; break; default: ASSERT(0); @@ -293,19 +293,12 @@ static int xfs_trans_log_finish_rmap_update( struct xfs_trans *tp, struct xfs_rud_log_item *rudp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, + struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { int error; - error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, - startblock, blockcount, state, pcur); + error = xfs_rmap_finish_one(tp, ri, pcur); /* * Mark the transaction dirty, even on error. 
This ensures the @@ -342,7 +335,7 @@ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, struct xfs_rui_log_item *ruip, - struct xfs_rmap_intent *rmap) + struct xfs_rmap_intent *ri) { uint next_extent; struct xfs_map_extent *map; @@ -358,12 +351,12 @@ xfs_rmap_update_log_item( next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; ASSERT(next_extent < ruip->rui_format.rui_nextents); map = &ruip->rui_format.rui_extents[next_extent]; - map->me_owner = rmap->ri_owner; - map->me_startblock = rmap->ri_bmap.br_startblock; - map->me_startoff = rmap->ri_bmap.br_startoff; - map->me_len = rmap->ri_bmap.br_blockcount; - xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, - rmap->ri_bmap.br_state); + map->me_owner = ri->ri_owner; + map->me_startblock = ri->ri_bmap.br_startblock; + map->me_startoff = ri->ri_bmap.br_startoff; + map->me_len = ri->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, ri->ri_type, ri->ri_whichfork, + ri->ri_bmap.br_state); } static struct xfs_log_item * @@ -375,15 +368,15 @@ xfs_rmap_update_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; ASSERT(count > 0); xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); - list_for_each_entry(rmap, items, ri_list) - xfs_rmap_update_log_item(tp, ruip, rmap); + list_for_each_entry(ri, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, ri); return &ruip->rui_item; } @@ -405,16 +398,14 @@ xfs_rmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; int error; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), - rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + ri = container_of(item, struct xfs_rmap_intent, ri_list); + + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -431,10 +422,10 @@ STATIC void xfs_rmap_update_cancel_item( struct list_head *item) { - struct xfs_rmap_intent *rmap; + struct xfs_rmap_intent *ri; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - kmem_cache_free(xfs_rmap_intent_cache, rmap); + ri = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_cache_free(xfs_rmap_intent_cache, ri); } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { @@ -451,15 +442,15 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { static inline bool xfs_rui_validate_map( struct xfs_mount *mp, - struct xfs_map_extent *rmap) + struct xfs_map_extent *map) { if (!xfs_has_rmapbt(mp)) return false; - if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) + if (map->me_flags & ~XFS_RMAP_EXTENT_FLAGS) return false; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: @@ -473,14 +464,14 @@ xfs_rui_validate_map( return false; } - if (!XFS_RMAP_NON_INODE_OWNER(rmap->me_owner) && - !xfs_verify_ino(mp, rmap->me_owner)) + if (!XFS_RMAP_NON_INODE_OWNER(map->me_owner) && + !xfs_verify_ino(mp, map->me_owner)) return false; - if (!xfs_verify_fileext(mp, rmap->me_startoff, 
rmap->me_len)) + if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; - return xfs_verify_fsbext(mp, rmap->me_startblock, rmap->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } /* @@ -493,15 +484,11 @@ xfs_rui_item_recover( struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - enum xfs_rmap_intent_type type; - xfs_exntst_t state; int i; - int whichfork; int error = 0; /* @@ -526,35 +513,34 @@ xfs_rui_item_recover( rudp = xfs_trans_get_rud(tp, ruip); for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - rmap = &ruip->rui_format.rui_extents[i]; - state = (rmap->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (rmap->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + struct xfs_rmap_intent fake = { }; + struct xfs_map_extent *map; + + map = &ruip->rui_format.rui_extents[i]; + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: - type = XFS_RMAP_MAP; + fake.ri_type = XFS_RMAP_MAP; break; case XFS_RMAP_EXTENT_MAP_SHARED: - type = XFS_RMAP_MAP_SHARED; + fake.ri_type = XFS_RMAP_MAP_SHARED; break; case XFS_RMAP_EXTENT_UNMAP: - type = XFS_RMAP_UNMAP; + fake.ri_type = XFS_RMAP_UNMAP; break; case XFS_RMAP_EXTENT_UNMAP_SHARED: - type = XFS_RMAP_UNMAP_SHARED; + fake.ri_type = XFS_RMAP_UNMAP_SHARED; break; case XFS_RMAP_EXTENT_CONVERT: - type = XFS_RMAP_CONVERT; + fake.ri_type = XFS_RMAP_CONVERT; break; case XFS_RMAP_EXTENT_CONVERT_SHARED: - type = XFS_RMAP_CONVERT_SHARED; + fake.ri_type = XFS_RMAP_CONVERT_SHARED; break; case XFS_RMAP_EXTENT_ALLOC: - type = XFS_RMAP_ALLOC; + fake.ri_type = XFS_RMAP_ALLOC; break; case XFS_RMAP_EXTENT_FREE: - type = XFS_RMAP_FREE; + fake.ri_type = XFS_RMAP_FREE; break; default: XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -563,13 +549,21 @@ xfs_rui_item_recover( error = -EFSCORRUPTED; goto abort_error; } - error = xfs_trans_log_finish_rmap_update(tp, rudp, type, - rmap->me_owner, whichfork, - rmap->me_startoff, rmap->me_startblock, - rmap->me_len, state, &rcur); + + fake.ri_owner = map->me_owner; + fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + fake.ri_bmap.br_startblock = map->me_startblock; + fake.ri_bmap.br_startoff = map->me_startoff; + fake.ri_bmap.br_blockcount = map->me_len; + fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? 
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, + &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - rmap, sizeof(*rmap)); + map, sizeof(*map)); if (error) goto abort_error; @@ -600,18 +594,18 @@ xfs_rui_item_relog( { struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; - struct xfs_map_extent *extp; + struct xfs_map_extent *map; unsigned int count; count = RUI_ITEM(intent)->rui_format.rui_nextents; - extp = RUI_ITEM(intent)->rui_format.rui_extents; + map = RUI_ITEM(intent)->rui_format.rui_extents; tp->t_flags |= XFS_TRANS_DIRTY; rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); ruip = xfs_rui_init(tp->t_mountp, count); - memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); xfs_trans_add_item(tp, &ruip->rui_item); set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f7faf6e70d7f..a3c6b1548723 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -69,7 +69,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -struct kobj_type xfs_mp_ktype = { +const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -266,7 +266,7 @@ static struct attribute *xfs_dbg_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_dbg); -struct kobj_type xfs_dbg_ktype = { +const struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_dbg_groups, @@ -324,7 +324,7 @@ static struct attribute *xfs_stats_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_stats); -struct kobj_type xfs_stats_ktype = { +const struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_stats_groups, @@ -410,7 +410,7 @@ static struct attribute *xfs_log_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_log); -struct kobj_type xfs_log_ktype = { +const struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_log_groups, @@ -564,13 +564,13 @@ static struct attribute *xfs_error_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_error); -static struct kobj_type xfs_error_cfg_ktype = { +static const struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_error_groups, }; -static struct kobj_type xfs_error_ktype = { +static const struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 513095e353a5..148893ebfdef 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,10 +7,10 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ -extern struct kobj_type xfs_dbg_ktype; /* debug */ -extern struct kobj_type xfs_log_ktype; /* xlog */ -extern struct kobj_type xfs_stats_ktype; /* stats */ +extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern const struct kobj_type xfs_dbg_ktype; /* debug */ +extern const struct kobj_type xfs_log_ktype; /* xlog */ +extern const struct kobj_type xfs_stats_ktype; /* stats */ static inline struct xfs_kobj * to_kobj(struct kobject *kobject) @@ -28,7 +28,7 @@ xfs_sysfs_release(struct kobject *kobject) static inline int xfs_sysfs_init( struct xfs_kobj *kobj, 
- struct kobj_type *ktype, + const struct kobj_type *ktype, struct xfs_kobj *parent_kobj, const char *name) { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 421d1e504ac4..6b0e9ae7c513 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3207,17 +3207,14 @@ DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); TRACE_EVENT(xfs_refcount_finish_one_leftover, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int type, xfs_agblock_t agbno, xfs_extlen_t len, - xfs_agblock_t new_agbno, xfs_extlen_t new_len), - TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(int, type) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_agblock_t, new_agbno) - __field(xfs_extlen_t, new_len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; @@ -3225,17 +3222,13 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, __entry->type = type; __entry->agbno = agbno; __entry->len = len; - __entry->new_agbno = new_agbno; - __entry->new_len = new_len; ), - TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", + TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, __entry->agbno, - __entry->len, - __entry->new_agbno, - __entry->new_len) + __entry->len) ); /* simple inode-based error/%ip tracepoint class */ diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 9fe54f5319f2..645f7229de4a 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o sysfs.o +zonefs-y := super.o file.o sysfs.o diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c new file mode 100644 index 000000000000..738b0e28d74b --- /dev/null +++ b/fs/zonefs/file.c @@ -0,0 +1,878 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/iomap.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/statfs.h> +#include <linux/writeback.h> +#include <linux/quotaops.h> +#include <linux/seq_file.h> +#include <linux/parser.h> +#include <linux/uio.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> + +#include "zonefs.h" + +#include "trace.h" + +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. 
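[Editorial aside on the zonefs_read_iomap_begin() comment above: stripped of the locking and tracing in the function that follows, the read-side mapping decision on the block-aligned offset is simply "mapped below EOF, hole past EOF". A standalone sketch with hypothetical types, not the zonefs code itself:]

/* Hypothetical sketch of the read-side mapping decision -- not zonefs code. */
#include <stdbool.h>

struct mapping {
	bool			hole;	/* true: report a hole, false: mapped */
	unsigned long long	addr;	/* device byte address when mapped */
	unsigned long long	length;
};

void map_read(unsigned long long zone_start, unsigned long long isize,
	      unsigned long long offset, unsigned long long length,
	      struct mapping *map)
{
	if (offset >= isize) {		/* past EOF: a hole covering the request */
		map->hole = true;
		map->addr = 0;
		map->length = length;
	} else {			/* below EOF: mapped, extent ends at EOF */
		map->hole = false;
		map->addr = zone_start + offset;
		map->length = isize - offset;
	}
}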
+ */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ + if (WARN_ON_ONCE(offset + length > z->z_capacity)) + return -EIO; + + /* + * Sequential zones can only accept direct writes. This is already + * checked when writes are issued, so warn if we see a page writeback + * operation. + */ + if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* + * For conventional zones, all blocks are always mapped. For sequential + * zones, all blocks after always mapped below the inode size (zone + * write pointer) and unwriten beyond. + */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; + iomap->length = z->z_capacity - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, +}; + +static int zonefs_read_folio(struct file *unused, struct folio *folio) +{ + return iomap_read_folio(folio, &zonefs_read_iomap_ops); +} + +static void zonefs_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &zonefs_read_iomap_ops); +} + +/* + * Map blocks for page writeback. This is used only on conventional zone files, + * which implies that the page range can only be within the fixed inode size. 
+ */ +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; + + /* If the mapping is already OK, nothing needs to be done */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + return zonefs_write_iomap_begin(inode, offset, + z->z_capacity - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops zonefs_writeback_ops = { + .map_blocks = zonefs_write_map_blocks, +}; + +static int zonefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +} + +static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct inode *inode = file_inode(swap_file); + + if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); +} + +const struct address_space_operations zonefs_file_aops = { + .read_folio = zonefs_read_folio, + .readahead = zonefs_readahead, + .writepages = zonefs_writepages, + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .migrate_folio = filemap_migrate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .direct_IO = noop_direct_IO, + .swap_activate = zonefs_swap_activate, +}; + +int zonefs_file_truncate(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t old_isize; + enum req_op op; + int ret = 0; + + /* + * Only sequential zone files can be truncated and truncation is allowed + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ + if (!zonefs_zone_is_seq(z)) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; + else if (isize == z->z_capacity) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; + + inode_dio_wait(inode); + + /* Serialize against page faults */ + filemap_invalidate_lock(inode->i_mapping); + + /* Serialize against zonefs_iomap_begin() */ + mutex_lock(&zi->i_truncate_mutex); + + old_isize = i_size_read(inode); + if (isize == old_isize) + goto unlock; + + ret = zonefs_inode_zone_mgmt(inode, op); + if (ret) + goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (z->z_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. 
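[Editorial aside: from userspace, the truncate semantics described in zonefs_file_truncate() mean a sequential zone file can only be reset or finished, never partially trimmed, and conventional zone files cannot be truncated at all. A small usage sketch follows; the mount point, path and capacity value are made up for illustration.]

/* Hypothetical usage sketch -- the path and capacity are illustrative only. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/zonefs/seq/0";	/* a sequential zone file */
	const off_t capacity = 256L << 20;		/* assumed max file size of that zone */
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return 1;

	/* Truncating to 0 resets the zone: data is dropped, the write pointer rewinds. */
	if (ftruncate(fd, 0) < 0)
		perror("zone reset");

	/* Truncating to the maximum file size finishes the zone: no further writes. */
	if (ftruncate(fd, capacity) < 0)
		perror("zone finish");

	/* Any other length is refused with EPERM, as is truncating a conventional zone file. */
	close(fd);
	return 0;
}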
+ */ + if (!isize) + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + z->z_flags &= ~ZONEFS_ZONE_OPEN; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); + z->z_wpoffset = isize; + zonefs_inode_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + filemap_invalidate_unlock(inode->i_mapping); + + return ret; +} + +static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(file); + int ret = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + /* + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ + if (zonefs_inode_is_cnv(inode)) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); + + if (ret) + zonefs_io_error(inode, true); + + return ret; +} + +static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + /* + * Sanity check: only conventional zone files can have shared + * writeable mappings. + */ + if (zonefs_inode_is_seq(inode)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + /* Serialize against truncates */ + filemap_invalidate_lock_shared(inode->i_mapping); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + filemap_invalidate_unlock_shared(inode->i_mapping); + + sb_end_pagefault(inode->i_sb); + return ret; +} + +static const struct vm_operations_struct zonefs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = zonefs_filemap_page_mkwrite, +}; + +static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* + * Conventional zones accept random writes, so their files can support + * shared writable mappings. For sequential zone files, only read + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ + if (zonefs_inode_is_seq(file_inode(file)) && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + + file_accessed(file); + vma->vm_ops = &zonefs_file_vm_ops; + + return 0; +} + +static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t isize = i_size_read(file_inode(file)); + + /* + * Seeks are limited to below the zone size for conventional zones + * and below the zone write pointer for sequential zones. In both + * cases, this limit is the inode size. + */ + return generic_file_llseek_size(file, offset, whence, isize, isize); +} + +static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (error) { + zonefs_io_error(inode, true); + return error; + } + + if (size && zonefs_inode_is_seq(inode)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed + * successfully necessarily means that all preceding writes + * were also successful. So we can safely increase the inode + * size to the write end location. 
+ */ + mutex_lock(&zi->i_truncate_mutex); + if (i_size_read(inode) < iocb->ki_pos + size) { + zonefs_update_stats(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); + } + mutex_unlock(&zi->i_truncate_mutex); + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; + +static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; + ssize_t size; + int nr_pages; + ssize_t ret; + + max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); + iov_iter_truncate(from, max); + + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); + if (!nr_pages) + return 0; + + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); + bio->bi_iter.bi_sector = z->z_sector; + bio->bi_ioprio = iocb->ki_ioprio; + if (iocb_is_dsync(iocb)) + bio->bi_opf |= REQ_FUA; + + ret = bio_iov_iter_get_pages(bio, from); + if (unlikely(ret)) + goto out_release; + + size = bio->bi_iter.bi_size; + task_io_account_write(size); + + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, iocb); + + ret = submit_bio_wait(bio); + + /* + * If the file zone was written underneath the file system, the zone + * write pointer may not be where we expect it to be, but the zone + * append write can still succeed. So check manually that we wrote where + * we intended to, that is, at zi->i_wpoffset. + */ + if (!ret) { + sector_t wpsector = + z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", + wpsector, z->z_sector); + ret = -EIO; + } + } + + zonefs_file_write_dio_end_io(iocb, size, ret, 0); + trace_zonefs_file_dio_append(inode, size, ret); + +out_release: + bio_release_pages(bio, false); + bio_put(bio); + + if (ret >= 0) { + iocb->ki_pos += size; + return size; + } + + return ret; +} + +/* + * Do not exceed the LFS limits nor the file zone size. If pos is under the + * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 
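+ * For example, a 1 MB write issued 512 KB before the zone capacity limit
+ * is shortened to 512 KB instead of failing.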
+ */
+static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+ loff_t count)
+{
+ struct inode *inode = file_inode(file);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+ loff_t max_size = z->z_capacity;
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ count = min(count, limit - pos);
+ }
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = min_t(loff_t, MAX_NON_LFS, max_size);
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+
+ return min(count, max_size - pos);
+}
+
+static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ loff_t count;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+ return -EINVAL;
+
+ if (iocb->ki_flags & IOCB_APPEND) {
+ if (zonefs_zone_is_cnv(z))
+ return -EINVAL;
+ mutex_lock(&zi->i_truncate_mutex);
+ iocb->ki_pos = z->z_wpoffset;
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ count = zonefs_write_check_limits(file, iocb->ki_pos,
+ iov_iter_count(from));
+ if (count < 0)
+ return count;
+
+ iov_iter_truncate(from, count);
+ return iov_iter_count(from);
+}
+
+/*
+ * Handle direct writes. For sequential zone files, this is the only possible
+ * write path. For these files, check that the user is issuing writes
+ * sequentially from the end of the file. This code assumes that the block layer
+ * delivers write requests to the device in sequential order. This is always the
+ * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
+ * elevator feature is being used (e.g. mq-deadline). The block layer always
+ * automatically selects such an elevator for zoned block devices during
+ * device initialization.
+ */
+static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct super_block *sb = inode->i_sb;
+ bool sync = is_sync_kiocb(iocb);
+ bool append = false;
+ ssize_t ret, count;
+
+ /*
+ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
+ * as this can cause write reordering (e.g. the first aio gets EAGAIN
+ * on the inode lock but the second goes through and is now unaligned).
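+	 * Synchronous IOCB_NOWAIT users (e.g. pwritev2() with RWF_NOWAIT)
+	 * are still accepted and use the inode_trylock() below to avoid
+	 * blocking on the inode lock.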
+ */ + if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + count = zonefs_write_checks(iocb, from); + if (count <= 0) { + ret = count; + goto inode_unlock; + } + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + + /* Enforce sequential writes (append only) in sequential zones */ + if (zonefs_zone_is_seq(z)) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != z->z_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; + } + mutex_unlock(&zi->i_truncate_mutex); + append = sync; + } + + if (append) + ret = zonefs_file_dio_append(iocb, from); + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); + if (zonefs_zone_is_seq(z) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ + mutex_lock(&zi->i_truncate_mutex); + z->z_wpoffset += count; + zonefs_inode_account_active(inode); + mutex_unlock(&zi->i_truncate_mutex); + } + +inode_unlock: + inode_unlock(inode); + + return ret; +} + +static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ + if (zonefs_inode_is_seq(inode)) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = zonefs_write_checks(iocb, from); + if (ret <= 0) + goto inode_unlock; + + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); + if (ret > 0) + iocb->ki_pos += ret; + else if (ret == -EIO) + zonefs_io_error(inode, true); + +inode_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (sb_rdonly(inode->i_sb)) + return -EROFS; + + /* Write operations beyond the zone capacity are not allowed */ + if (iocb->ki_pos >= z->z_capacity) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) { + ssize_t ret = zonefs_file_dio_write(iocb, from); + + if (ret != -ENOTBLK) + return ret; + } + + return zonefs_file_buffered_write(iocb, from); +} + +static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + if (error) { + zonefs_io_error(file_inode(iocb->ki_filp), false); + return error; + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_read_dio_ops = { + .end_io = zonefs_file_read_dio_end_io, +}; + +static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; + + /* Offline zones cannot be read */ + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 
0777))) + return -EPERM; + + if (iocb->ki_pos >= z->z_capacity) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + /* Limit read operations to written data */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (iocb->ki_pos >= isize) { + mutex_unlock(&zi->i_truncate_mutex); + ret = 0; + goto inode_unlock; + } + iov_iter_truncate(to, isize - iocb->ki_pos); + mutex_unlock(&zi->i_truncate_mutex); + + if (iocb->ki_flags & IOCB_DIRECT) { + size_t count = iov_iter_count(to); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, + &zonefs_read_dio_ops, 0, NULL, 0); + } else { + ret = generic_file_read_iter(iocb, to); + if (ret == -EIO) + zonefs_io_error(inode, false); + } + +inode_unlock: + inode_unlock_shared(inode); + + return ret; +} + +/* + * Write open accounting is done only for sequential files. + */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) +{ + if (zonefs_inode_is_cnv(inode)) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_seq_file_write_open(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); + + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + + if (sbi->s_max_wro_seq_files + && wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < z->z_capacity) { + ret = zonefs_inode_zone_mgmt(inode, + REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + z->z_flags |= ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); + } + } + } + + zi->i_wr_refcnt++; + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); + + return 0; +} + +static void zonefs_seq_file_write_close(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt--; + if (zi->i_wr_refcnt) + goto unlock; + + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ + if (z->z_flags & ZONEFS_ZONE_OPEN) { + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. 
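+			 * (Zoned devices typically limit how many zones can
+			 * be open or active at the same time; a zone that
+			 * stays open because the close failed keeps consuming
+			 * one of these resources until it is reset or
+			 * finished.)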
+ */ + if (z->z_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + z->z_sector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + goto unlock; + } + + z->z_flags &= ~ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); + } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); + + return 0; +} + +const struct file_operations zonefs_file_operations = { + .open = zonefs_file_open, + .release = zonefs_file_release, + .fsync = zonefs_file_fsync, + .mmap = zonefs_file_mmap, + .llseek = zonefs_file_llseek, + .read_iter = zonefs_file_read_iter, + .write_iter = zonefs_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .iopoll = iocb_bio_iopoll, +}; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 72ef97320b99..23b8b299c64e 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -28,33 +28,47 @@ #include "trace.h" /* - * Manage the active zone count. Called with zi->i_truncate_mutex held. + * Get the name of a zone group directory. */ -static void zonefs_account_active(struct inode *inode) +static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + switch (ztype) { + case ZONEFS_ZTYPE_CNV: + return "cnv"; + case ZONEFS_ZTYPE_SEQ: + return "seq"; + default: + WARN_ON_ONCE(1); + return "???"; + } +} - lockdep_assert_held(&zi->i_truncate_mutex); +/* + * Manage the active zone count. + */ +static void zonefs_account_active(struct super_block *sb, + struct zonefs_zone *z) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (zonefs_zone_is_cnv(z)) return; /* * For zones that transitioned to the offline or readonly condition, * we only need to clear the active state. */ - if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) goto out; /* * If the zone is active, that is, if it is explicitly open or * partially written, check if it was already accounted as active. */ - if ((zi->i_flags & ZONEFS_ZONE_OPEN) || - (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { - if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { - zi->i_flags |= ZONEFS_ZONE_ACTIVE; + if ((z->z_flags & ZONEFS_ZONE_OPEN) || + (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { + if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { + z->z_flags |= ZONEFS_ZONE_ACTIVE; atomic_inc(&sbi->s_active_seq_files); } return; @@ -62,18 +76,29 @@ static void zonefs_account_active(struct inode *inode) out: /* The zone is not active. If it was, update the active count */ - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { - zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + if (z->z_flags & ZONEFS_ZONE_ACTIVE) { + z->z_flags &= ~ZONEFS_ZONE_ACTIVE; atomic_dec(&sbi->s_active_seq_files); } } -static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. 
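+ * This is a thin wrapper around zonefs_account_active() for callers that
+ * have an inode rather than a zone, and it checks that the lock is held.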
+ */ +void zonefs_inode_account_active(struct inode *inode) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret; + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - lockdep_assert_held(&zi->i_truncate_mutex); + return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); +} + +/* + * Execute a zone management operation. + */ +static int zonefs_zone_mgmt(struct super_block *sb, + struct zonefs_zone *z, enum req_op op) +{ + int ret; /* * With ZNS drives, closing an explicitly open zone that has not been @@ -83,201 +108,49 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) * are exceeded, make sure that the zone does not remain active by * resetting it. */ - if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) op = REQ_OP_ZONE_RESET; - trace_zonefs_zone_mgmt(inode, op); - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + trace_zonefs_zone_mgmt(sb, z, op); + ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, + z->z_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) { - zonefs_err(inode->i_sb, + zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", - blk_op_str(op), zi->i_zsector, ret); + blk_op_str(op), z->z_sector, ret); return ret; } return 0; } -static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - i_size_write(inode, isize); - /* - * A full zone is no longer open/active and does not need - * explicit closing. - */ - if (isize >= zi->i_max_size) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) - atomic_dec(&sbi->s_active_seq_files); - zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); - } -} - -static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - /* - * All blocks are always mapped below EOF. If reading past EOF, - * act as if there is a hole up to the file maximum size. 
- */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - iomap->length = length; - } else { - iomap->type = IOMAP_MAPPED; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; + return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); } -static const struct iomap_ops zonefs_read_iomap_ops = { - .iomap_begin = zonefs_read_iomap_begin, -}; - -static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) +void zonefs_i_size_write(struct inode *inode, loff_t isize) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; + struct zonefs_zone *z = zonefs_inode_zone(inode); - /* All write I/Os should always be within the file maximum size */ - if (WARN_ON_ONCE(offset + length > zi->i_max_size)) - return -EIO; - - /* - * Sequential zones can only accept direct writes. This is already - * checked when writes are issued, so warn if we see a page writeback - * operation. - */ - if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - !(flags & IOMAP_DIRECT))) - return -EIO; + i_size_write(inode, isize); /* - * For conventional zones, all blocks are always mapped. For sequential - * zones, all blocks after always mapped below the inode size (zone - * write pointer) and unwriten beyond. + * A full zone is no longer open/active and does not need + * explicit closing. */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_UNWRITTEN; - iomap->length = zi->i_max_size - iomap->offset; - } else { - iomap->type = IOMAP_MAPPED; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; -} - -static const struct iomap_ops zonefs_write_iomap_ops = { - .iomap_begin = zonefs_write_iomap_begin, -}; - -static int zonefs_read_folio(struct file *unused, struct folio *folio) -{ - return iomap_read_folio(folio, &zonefs_read_iomap_ops); -} - -static void zonefs_readahead(struct readahead_control *rac) -{ - iomap_readahead(rac, &zonefs_read_iomap_ops); -} - -/* - * Map blocks for page writeback. This is used only on conventional zone files, - * which implies that the page range can only be within the fixed inode size. 
- */ -static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return -EIO; - if (WARN_ON_ONCE(offset >= i_size_read(inode))) - return -EIO; - - /* If the mapping is already OK, nothing needs to be done */ - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; - - return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); -} - -static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_write_map_blocks, -}; - -static int zonefs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct iomap_writepage_ctx wpc = { }; - - return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); -} - -static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct inode *inode = file_inode(swap_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + if (isize >= z->z_capacity) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { - zonefs_err(inode->i_sb, - "swap file: not a conventional zone file\n"); - return -EINVAL; + if (z->z_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); + z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); } - - return iomap_swapfile_activate(sis, swap_file, span, - &zonefs_read_iomap_ops); } -static const struct address_space_operations zonefs_file_aops = { - .read_folio = zonefs_read_folio, - .readahead = zonefs_readahead, - .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, - .release_folio = iomap_release_folio, - .invalidate_folio = iomap_invalidate_folio, - .migrate_folio = filemap_migrate_folio, - .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, - .direct_IO = noop_direct_IO, - .swap_activate = zonefs_swap_activate, -}; - -static void zonefs_update_stats(struct inode *inode, loff_t new_isize) +void zonefs_update_stats(struct inode *inode, loff_t new_isize) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); @@ -310,63 +183,69 @@ static void zonefs_update_stats(struct inode *inode, loff_t new_isize) } /* - * Check a zone condition and adjust its file inode access permissions for - * offline and readonly zones. Return the inode size corresponding to the - * amount of readable data in the zone. + * Check a zone condition. Return the amount of written (and still readable) + * data in the zone. */ -static loff_t zonefs_check_zone_condition(struct inode *inode, - struct blk_zone *zone, bool warn, - bool mount) +static loff_t zonefs_check_zone_condition(struct super_block *sb, + struct zonefs_zone *z, + struct blk_zone *zone) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - switch (zone->cond) { case BLK_ZONE_COND_OFFLINE: - /* - * Dead zone: make the inode immutable, disable all accesses - * and set the file size to 0 (zone wp set to zone start). 
- */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode &= ~0777; - zone->wp = zone->start; - zi->i_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_warn(sb, "Zone %llu: offline zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* - * The write pointer of read-only zones is invalid. If such a - * zone is found during mount, the file size cannot be retrieved - * so we treat the zone as offline (mount == true case). - * Otherwise, keep the file size as it was when last updated - * so that the user can recover data. In both cases, writes are - * always disabled for the zone. + * The write pointer of read-only zones is invalid, so we cannot + * determine the zone wpoffset (inode size). We thus keep the + * zone wpoffset as is, which leads to an empty file + * (wpoffset == 0) on mount. For a runtime error, this keeps + * the inode size as it was when last updated so that the user + * can recover data. */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - if (mount) { - zone->cond = BLK_ZONE_COND_OFFLINE; - inode->i_mode &= ~0777; - zone->wp = zone->start; - zi->i_flags |= ZONEFS_ZONE_OFFLINE; - return 0; - } - zi->i_flags |= ZONEFS_ZONE_READONLY; - inode->i_mode &= ~0222; - return i_size_read(inode); + zonefs_warn(sb, "Zone %llu: read-only zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_READONLY; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; + return z->z_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ - return zi->i_max_size; + return z->z_capacity; default: - if (zi->i_ztype == ZONEFS_ZTYPE_CNV) - return zi->i_max_size; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; return (zone->wp - zone->start) << SECTOR_SHIFT; } } +/* + * Check a zone condition and adjust its inode access permissions for + * offline and readonly zones. + */ +static void zonefs_inode_update_mode(struct inode *inode) +{ + struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (z->z_flags & ZONEFS_ZONE_OFFLINE) { + /* Offline zones cannot be read nor written */ + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; + } else if (z->z_flags & ZONEFS_ZONE_READONLY) { + /* Readonly zones cannot be written */ + inode->i_flags |= S_IMMUTABLE; + if (z->z_flags & ZONEFS_ZONE_INIT_MODE) + inode->i_mode &= ~0777; + else + inode->i_mode &= ~0222; + } + + z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; + z->z_mode = inode->i_mode; +} + struct zonefs_ioerr_data { struct inode *inode; bool write; @@ -377,7 +256,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, { struct zonefs_ioerr_data *err = data; struct inode *inode = err->inode; - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); loff_t isize, data_size; @@ -388,10 +267,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * as there is no inconsistency between the inode size and the amount of * data writen in the zone (data_size). 
*/ - data_size = zonefs_check_zone_condition(inode, zone, true, false); + data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); - if (zone->cond != BLK_ZONE_COND_OFFLINE && - zone->cond != BLK_ZONE_COND_READONLY && + if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && !err->write && isize == data_size) return 0; @@ -414,8 +292,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) - zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + if (zonefs_zone_is_seq(z) && isize != data_size) + zonefs_warn(sb, + "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* @@ -424,24 +303,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * zone condition to read-only and offline respectively, as if the * condition was signaled by the hardware. */ - if (zone->cond == BLK_ZONE_COND_OFFLINE || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { + if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { zonefs_warn(sb, "inode %lu: read/write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_OFFLINE) { - zone->cond = BLK_ZONE_COND_OFFLINE; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } - } else if (zone->cond == BLK_ZONE_COND_READONLY || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { + if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) + z->z_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_inode_update_mode(inode); + data_size = 0; + } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { zonefs_warn(sb, "inode %lu: write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_READONLY) { - zone->cond = BLK_ZONE_COND_READONLY; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } + if (!(z->z_flags & ZONEFS_ZONE_READONLY)) + z->z_flags |= ZONEFS_ZONE_READONLY; + zonefs_inode_update_mode(inode); + data_size = isize; } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && data_size > isize) { /* Do not expose garbage data */ @@ -455,9 +332,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * close of the zone when the inode file is closed. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && - (zone->cond == BLK_ZONE_COND_OFFLINE || - zone->cond == BLK_ZONE_COND_READONLY)) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) + z->z_flags &= ~ZONEFS_ZONE_OPEN; /* * If error=remount-ro was specified, any error result in remounting @@ -474,8 +350,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, */ zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); - zi->i_wpoffset = data_size; - zonefs_account_active(inode); + z->z_wpoffset = data_size; + zonefs_inode_account_active(inode); return 0; } @@ -487,9 +363,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). 
*/ -static void __zonefs_io_error(struct inode *inode, bool write) +void __zonefs_io_error(struct inode *inode, bool write) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; @@ -505,8 +381,8 @@ static void __zonefs_io_error(struct inode *inode, bool write) * files with aggregated conventional zones, for which the inode zone * size is always larger than the device zone size. */ - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) - nr_zones = zi->i_zone_size >> + if (z->z_size > bdev_zone_sectors(sb->s_bdev)) + nr_zones = z->z_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT); /* @@ -518,7 +394,7 @@ static void __zonefs_io_error(struct inode *inode, bool write) * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, + ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, zonefs_io_error_cb, &err); if (ret != nr_zones) zonefs_err(sb, "Get inode %lu zone information failed %d\n", @@ -526,749 +402,6 @@ static void __zonefs_io_error(struct inode *inode, bool write) memalloc_noio_restore(noio_flag); } -static void zonefs_io_error(struct inode *inode, bool write) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - mutex_lock(&zi->i_truncate_mutex); - __zonefs_io_error(inode, write); - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_truncate(struct inode *inode, loff_t isize) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t old_isize; - enum req_op op; - int ret = 0; - - /* - * Only sequential zone files can be truncated and truncation is allowed - * only down to a 0 size, which is equivalent to a zone reset, and to - * the maximum file size, which is equivalent to a zone finish. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EPERM; - - if (!isize) - op = REQ_OP_ZONE_RESET; - else if (isize == zi->i_max_size) - op = REQ_OP_ZONE_FINISH; - else - return -EPERM; - - inode_dio_wait(inode); - - /* Serialize against page faults */ - filemap_invalidate_lock(inode->i_mapping); - - /* Serialize against zonefs_iomap_begin() */ - mutex_lock(&zi->i_truncate_mutex); - - old_isize = i_size_read(inode); - if (isize == old_isize) - goto unlock; - - ret = zonefs_zone_mgmt(inode, op); - if (ret) - goto unlock; - - /* - * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, - * take care of open zones. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - /* - * Truncating a zone to EMPTY or FULL is the equivalent of - * closing the zone. For a truncation to 0, we need to - * re-open the zone to ensure new writes can be processed. - * For a truncation to the maximum file size, the zone is - * closed and writes cannot be accepted anymore, so clear - * the open flag. 
- */ - if (!isize) - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - else - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - } - - zonefs_update_stats(inode, isize); - truncate_setsize(inode, isize); - zi->i_wpoffset = isize; - zonefs_account_active(inode); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - filemap_invalidate_unlock(inode->i_mapping); - - return ret; -} - -static int zonefs_inode_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = d_inode(dentry); - int ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); - if (ret) - return ret; - - /* - * Since files and directories cannot be created nor deleted, do not - * allow setting any write attributes on the sub-directories grouping - * files by zone type. - */ - if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && - (iattr->ia_mode & 0222)) - return -EPERM; - - if (((iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->i_uid)) || - ((iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); - if (ret) - return ret; - } - - if (iattr->ia_valid & ATTR_SIZE) { - ret = zonefs_file_truncate(inode, iattr->ia_size); - if (ret) - return ret; - } - - setattr_copy(&nop_mnt_idmap, inode, iattr); - - return 0; -} - -static const struct inode_operations zonefs_file_inode_operations = { - .setattr = zonefs_inode_setattr, -}; - -static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - int ret = 0; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - /* - * Since only direct writes are allowed in sequential files, page cache - * flush is needed only for conventional zone files. - */ - if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) - ret = file_write_and_wait_range(file, start, end); - if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev); - - if (ret) - zonefs_io_error(inode, true); - - return ret; -} - -static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - vm_fault_t ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return VM_FAULT_SIGBUS; - - /* - * Sanity check: only conventional zone files can have shared - * writeable mappings. - */ - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return VM_FAULT_NOPAGE; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* Serialize against truncates */ - filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); - filemap_invalidate_unlock_shared(inode->i_mapping); - - sb_end_pagefault(inode->i_sb); - return ret; -} - -static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = zonefs_filemap_page_mkwrite, -}; - -static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* - * Conventional zones accept random writes, so their files can support - * shared writable mappings. For sequential zone files, only read - * mappings are possible since there are no guarantees for write - * ordering between msync() and page cache writeback. 
- */ - if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && - (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - - file_accessed(file); - vma->vm_ops = &zonefs_file_vm_ops; - - return 0; -} - -static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t isize = i_size_read(file_inode(file)); - - /* - * Seeks are limited to below the zone size for conventional zones - * and below the zone write pointer for sequential zones. In both - * cases, this limit is the inode size. - */ - return generic_file_llseek_size(file, offset, whence, isize, isize); -} - -static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (error) { - zonefs_io_error(inode, true); - return error; - } - - if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { - /* - * Note that we may be seeing completions out of order, - * but that is not a problem since a write completed - * successfully necessarily means that all preceding writes - * were also successful. So we can safely increase the inode - * size to the write end location. - */ - mutex_lock(&zi->i_truncate_mutex); - if (i_size_read(inode) < iocb->ki_pos + size) { - zonefs_update_stats(inode, iocb->ki_pos + size); - zonefs_i_size_write(inode, iocb->ki_pos + size); - } - mutex_unlock(&zi->i_truncate_mutex); - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, -}; - -static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - struct bio *bio; - ssize_t size; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. - */ - if (!ret) { - sector_t wpsector = - zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - wpsector, zi->i_zsector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); - -out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } - - return ret; -} - -/* - * Do not exceed the LFS limits nor the file zone size. If pos is under the - * limit it becomes a short access. 
If it exceeds the limit, return -EFBIG. - */ -static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, - loff_t count) -{ - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t limit = rlimit(RLIMIT_FSIZE); - loff_t max_size = zi->i_max_size; - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - count = min(count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = min_t(loff_t, MAX_NON_LFS, max_size); - - if (unlikely(pos >= max_size)) - return -EFBIG; - - return min(count, max_size - pos); -} - -static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t count; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - if (iocb->ki_flags & IOCB_APPEND) { - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EINVAL; - mutex_lock(&zi->i_truncate_mutex); - iocb->ki_pos = zi->i_wpoffset; - mutex_unlock(&zi->i_truncate_mutex); - } - - count = zonefs_write_check_limits(file, iocb->ki_pos, - iov_iter_count(from)); - if (count < 0) - return count; - - iov_iter_truncate(from, count); - return iov_iter_count(from); -} - -/* - * Handle direct writes. For sequential zone files, this is the only possible - * write path. For these files, check that the user is issuing writes - * sequentially from the end of the file. This code assumes that the block layer - * delivers write requests to the device in sequential order. This is always the - * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE - * elevator feature is being used (e.g. mq-deadline). The block layer always - * automatically select such an elevator for zoned block devices during the - * device initialization. - */ -static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - bool sync = is_sync_kiocb(iocb); - bool append = false; - ssize_t ret, count; - - /* - * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT - * as this can cause write reordering (e.g. the first aio gets EAGAIN - * on the inode lock but the second goes through but is now unaligned). 
- */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - count = zonefs_write_checks(iocb, from); - if (count <= 0) { - ret = count; - goto inode_unlock; - } - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - - /* Enforce sequential writes (append only) in sequential zones */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { - mutex_lock(&zi->i_truncate_mutex); - if (iocb->ki_pos != zi->i_wpoffset) { - mutex_unlock(&zi->i_truncate_mutex); - ret = -EINVAL; - goto inode_unlock; - } - mutex_unlock(&zi->i_truncate_mutex); - append = sync; - } - - if (append) - ret = zonefs_file_dio_append(iocb, from); - else - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - zi->i_wpoffset += count; - zonefs_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); - } - -inode_unlock: - inode_unlock(inode); - - return ret; -} - -static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, - struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - ssize_t ret; - - /* - * Direct IO writes are mandatory for sequential zone files so that the - * write IO issuing order is preserved. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) - return -EIO; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - ret = zonefs_write_checks(iocb, from); - if (ret <= 0) - goto inode_unlock; - - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) - zonefs_io_error(inode, true); - -inode_unlock: - inode_unlock(inode); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - - return ret; -} - -static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - if (sb_rdonly(inode->i_sb)) - return -EROFS; - - /* Write operations beyond the zone size are not allowed */ - if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) - return -EFBIG; - - if (iocb->ki_flags & IOCB_DIRECT) { - ssize_t ret = zonefs_file_dio_write(iocb, from); - if (ret != -ENOTBLK) - return ret; - } - - return zonefs_file_buffered_write(iocb, from); -} - -static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - if (error) { - zonefs_io_error(file_inode(iocb->ki_filp), false); - return error; - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_read_dio_ops = { - .end_io = zonefs_file_read_dio_end_io, -}; - -static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - ssize_t ret; - - /* Offline zones cannot be read */ - if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) - 
return -EPERM; - - if (iocb->ki_pos >= zi->i_max_size) - return 0; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock_shared(inode)) - return -EAGAIN; - } else { - inode_lock_shared(inode); - } - - /* Limit read operations to written data */ - mutex_lock(&zi->i_truncate_mutex); - isize = i_size_read(inode); - if (iocb->ki_pos >= isize) { - mutex_unlock(&zi->i_truncate_mutex); - ret = 0; - goto inode_unlock; - } - iov_iter_truncate(to, isize - iocb->ki_pos); - mutex_unlock(&zi->i_truncate_mutex); - - if (iocb->ki_flags & IOCB_DIRECT) { - size_t count = iov_iter_count(to); - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, - &zonefs_read_dio_ops, 0, NULL, 0); - } else { - ret = generic_file_read_iter(iocb, to); - if (ret == -EIO) - zonefs_io_error(inode, false); - } - -inode_unlock: - inode_unlock_shared(inode); - - return ret; -} - -/* - * Write open accounting is done only for sequential files. - */ -static inline bool zonefs_seq_file_need_wro(struct inode *inode, - struct file *file) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return false; - - if (!(file->f_mode & FMODE_WRITE)) - return false; - - return true; -} - -static int zonefs_seq_file_write_open(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - - if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - - if (sbi->s_max_wro_seq_files - && wro > sbi->s_max_wro_seq_files) { - atomic_dec(&sbi->s_wro_seq_files); - ret = -EBUSY; - goto unlock; - } - - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_wro_seq_files); - goto unlock; - } - zi->i_flags |= ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - } - } - - zi->i_wr_refcnt++; - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - - return ret; -} - -static int zonefs_file_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = generic_file_open(inode, file); - if (ret) - return ret; - - if (zonefs_seq_file_need_wro(inode, file)) - return zonefs_seq_file_write_open(inode); - - return 0; -} - -static void zonefs_seq_file_write_close(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - zi->i_wr_refcnt--; - if (zi->i_wr_refcnt) - goto unlock; - - /* - * The file zone may not be open anymore (e.g. the file was truncated to - * its maximum size or it was fully written). For this case, we only - * need to decrement the write open count. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) { - __zonefs_io_error(inode, false); - /* - * Leaving zones explicitly open may lead to a state - * where most zones cannot be written (zone resources - * exhausted). So take preventive action by remounting - * read-only. 
- */ - if (zi->i_flags & ZONEFS_ZONE_OPEN && - !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, - "closing zone at %llu failed %d\n", - zi->i_zsector, ret); - zonefs_warn(sb, - "remounting filesystem read-only\n"); - sb->s_flags |= SB_RDONLY; - } - goto unlock; - } - - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - - atomic_dec(&sbi->s_wro_seq_files); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_release(struct inode *inode, struct file *file) -{ - /* - * If we explicitly open a zone we must close it again as well, but the - * zone management operation can fail (either due to an IO error or as - * the zone has gone offline or read-only). Make sure we don't fail the - * close(2) for user-space. - */ - if (zonefs_seq_file_need_wro(inode, file)) - zonefs_seq_file_write_close(inode); - - return 0; -} - -static const struct file_operations zonefs_file_operations = { - .open = zonefs_file_open, - .release = zonefs_file_release, - .fsync = zonefs_file_fsync, - .mmap = zonefs_file_mmap, - .llseek = zonefs_file_llseek, - .read_iter = zonefs_file_read_iter, - .write_iter = zonefs_file_write_iter, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .iopoll = iocb_bio_iopoll, -}; - static struct kmem_cache *zonefs_inode_cachep; static struct inode *zonefs_alloc_inode(struct super_block *sb) @@ -1282,7 +415,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; - zi->i_flags = 0; return &zi->i_vnode; } @@ -1315,8 +447,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - if (sbi->s_nr_files[t]) - buf->f_files += sbi->s_nr_files[t] + 1; + if (sbi->s_zgroup[t].g_nr_zones) + buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; } buf->f_ffree = 0; @@ -1408,185 +540,440 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data) return zonefs_parse_options(sb, data); } -static const struct super_operations zonefs_sops = { - .alloc_inode = zonefs_alloc_inode, - .free_inode = zonefs_free_inode, - .statfs = zonefs_statfs, - .remount_fs = zonefs_remount, - .show_options = zonefs_show_options, -}; +static int zonefs_inode_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); + if (ret) + return ret; + + /* + * Since files and directories cannot be created nor deleted, do not + * allow setting any write attributes on the sub-directories grouping + * files by zone type. 
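+	 * (That is, the "cnv" and "seq" zone group directories always stay
+	 * read-only: a chmod adding any write permission bit to them fails
+	 * with -EPERM.)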
+ */ + if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && + (iattr->ia_mode & 0222)) + return -EPERM; + + if (((iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->i_uid)) || + ((iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->i_gid))) { + ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); + if (ret) + return ret; + } + + if (iattr->ia_valid & ATTR_SIZE) { + ret = zonefs_file_truncate(inode, iattr->ia_size); + if (ret) + return ret; + } -static const struct inode_operations zonefs_dir_inode_operations = { - .lookup = simple_lookup, + setattr_copy(&nop_mnt_idmap, inode, iattr); + + if (S_ISREG(inode->i_mode)) { + struct zonefs_zone *z = zonefs_inode_zone(inode); + + z->z_mode = inode->i_mode; + z->z_uid = inode->i_uid; + z->z_gid = inode->i_gid; + } + + return 0; +} + +static const struct inode_operations zonefs_file_inode_operations = { .setattr = zonefs_inode_setattr, }; -static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, - enum zonefs_ztype type) +static long zonefs_fname_to_fno(const struct qstr *fname) { - struct super_block *sb = parent->i_sb; + const char *name = fname->name; + unsigned int len = fname->len; + long fno = 0, shift = 1; + const char *rname; + char c = *name; + unsigned int i; - inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; - inode_init_owner(&nop_mnt_idmap, inode, parent, S_IFDIR | 0555); - inode->i_op = &zonefs_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - set_nlink(inode, 2); - inc_nlink(parent); + /* + * File names are always a base-10 number string without any + * leading 0s. + */ + if (!isdigit(c)) + return -ENOENT; + + if (len > 1 && c == '0') + return -ENOENT; + + if (len == 1) + return c - '0'; + + for (i = 0, rname = name + len - 1; i < len; i++, rname--) { + c = *rname; + if (!isdigit(c)) + return -ENOENT; + fno += (c - '0') * shift; + shift *= 10; + } + + return fno; } -static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static struct inode *zonefs_get_file_inode(struct inode *dir, + struct dentry *dentry) { - struct super_block *sb = inode->i_sb; + struct zonefs_zone_group *zgroup = dir->i_private; + struct super_block *sb = dir->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; + struct zonefs_zone *z; + struct inode *inode; + ino_t ino; + long fno; - inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; - inode->i_mode = S_IFREG | sbi->s_perm; + /* Get the file number from the file name */ + fno = zonefs_fname_to_fno(&dentry->d_name); + if (fno < 0) + return ERR_PTR(fno); - zi->i_ztype = type; - zi->i_zsector = zone->start; - zi->i_zone_size = zone->len << SECTOR_SHIFT; - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && - !(sbi->s_features & ZONEFS_F_AGGRCNV)) { - zonefs_err(sb, - "zone size %llu doesn't match device's zone sectors %llu\n", - zi->i_zone_size, - bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); - return -EINVAL; - } + if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) + return ERR_PTR(-ENOENT); - zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, - zone->capacity << SECTOR_SHIFT); - zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true); + z = &zgroup->g_zones[fno]; + ino = z->z_sector >> sbi->s_zone_sectors_shift; + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) { + WARN_ON_ONCE(inode->i_private != z); + return inode; + } - inode->i_uid = 
sbi->s_uid; - inode->i_gid = sbi->s_gid; - inode->i_size = zi->i_wpoffset; - inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT; + inode->i_ino = ino; + inode->i_mode = z->z_mode; + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode->i_uid = z->z_uid; + inode->i_gid = z->z_gid; + inode->i_size = z->z_wpoffset; + inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; + inode->i_private = z; inode->i_op = &zonefs_file_inode_operations; inode->i_fop = &zonefs_file_operations; inode->i_mapping->a_ops = &zonefs_file_aops; - sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); - sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; - sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; + /* Update the inode access rights depending on the zone condition */ + zonefs_inode_update_mode(inode); + + unlock_new_inode(inode); + + return inode; +} + +static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, + enum zonefs_ztype ztype) +{ + struct inode *root = d_inode(sb->s_root); + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct inode *inode; + ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + inode->i_ino = ino; + inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); + inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; + inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode->i_private = &sbi->s_zgroup[ztype]; + set_nlink(inode, 2); + + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &zonefs_dir_operations; + + unlock_new_inode(inode); + + return inode; +} - mutex_lock(&zi->i_truncate_mutex); + +static struct inode *zonefs_get_dir_inode(struct inode *dir, + struct dentry *dentry) +{ + struct super_block *sb = dir->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + const char *name = dentry->d_name.name; + enum zonefs_ztype ztype; /* - * For sequential zones, make sure that any open zone is closed first - * to ensure that the initial number of open zones is 0, in sync with - * the open zone accounting done when the mount option - * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + * We only need to check for the "seq" directory and + * the "cnv" directory if we have conventional zones. 
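+	 * Both zone group names ("cnv" and "seq") are 3 characters long,
+	 * which the name length check below relies on.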
 */
- if (type == ZONEFS_ZTYPE_SEQ &&
- (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
- zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
- if (ret)
- goto unlock;
+ if (dentry->d_name.len != 3)
+ return ERR_PTR(-ENOENT);
+
+ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+ if (sbi->s_zgroup[ztype].g_nr_zones &&
+ memcmp(name, zonefs_zgroup_name(ztype), 3) == 0)
+ break;
 }
+ if (ztype == ZONEFS_ZTYPE_MAX)
+ return ERR_PTR(-ENOENT);
- zonefs_account_active(inode);
+ return zonefs_get_zgroup_inode(sb, ztype);
+}
-unlock:
- mutex_unlock(&zi->i_truncate_mutex);
+static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode;
- return ret;
+ if (dentry->d_name.len > ZONEFS_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ if (dir == d_inode(dir->i_sb->s_root))
+ inode = zonefs_get_dir_inode(dir, dentry);
+ else
+ inode = zonefs_get_file_inode(dir, dentry);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_splice_alias(inode, dentry);
 }
-static struct dentry *zonefs_create_inode(struct dentry *parent,
- const char *name, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_readdir_root(struct file *file, struct dir_context *ctx)
 {
- struct inode *dir = d_inode(parent);
- struct dentry *dentry;
- struct inode *inode;
- int ret = -ENOMEM;
+ struct inode *inode = file_inode(file);
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV;
+ ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1;
- dentry = d_alloc_name(parent, name);
- if (!dentry)
- return ERR_PTR(ret);
+ if (ctx->pos >= inode->i_size)
+ return 0;
- inode = new_inode(parent->d_sb);
- if (!inode)
- goto dput;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone) {
- ret = zonefs_init_file_inode(inode, zone, type);
- if (ret) {
- iput(inode);
- goto dput;
- }
- } else {
- zonefs_init_dir_inode(dir, inode, type);
+ if (ctx->pos == 2) {
+ if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones)
+ ztype = ZONEFS_ZTYPE_SEQ;
+
+ if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+ base_ino + ztype, DT_DIR))
+ return 0;
+ ctx->pos++;
 }
- d_add(dentry, inode);
- dir->i_size++;
+ if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) {
+ ztype = ZONEFS_ZTYPE_SEQ;
+ if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+ base_ino + ztype, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
- return dentry;
+ return 0;
+}
+
+static int zonefs_readdir_zgroup(struct file *file,
+ struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+ struct zonefs_zone_group *zgroup = inode->i_private;
+ struct super_block *sb = inode->i_sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct zonefs_zone *z;
+ int fname_len;
+ char *fname;
+ ino_t ino;
+ int f;
+
+ /*
+ * The size of zone group directories is equal to the number
+ * of zone files in the group and does not include the "." and
+ * ".." entries. Hence the "+ 2" here.
+ */
+ if (ctx->pos >= inode->i_size + 2)
+ return 0;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
+ if (!fname)
+ return -ENOMEM;
+
+ for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) {
+ z = &zgroup->g_zones[f];
+ ino = z->z_sector >> sbi->s_zone_sectors_shift;
+ fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f);
+ if (!dir_emit(ctx, fname, fname_len, ino, DT_REG))
+ break;
+ ctx->pos++;
+ }
-dput:
- dput(dentry);
+ kfree(fname);
- return ERR_PTR(ret);
+ return 0;
+}
+
+static int zonefs_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct inode *inode = file_inode(file);
+
+ if (inode == d_inode(inode->i_sb->s_root))
+ return zonefs_readdir_root(file, ctx);
+
+ return zonefs_readdir_zgroup(file, ctx);
 }
+const struct inode_operations zonefs_dir_inode_operations = {
+ .lookup = zonefs_lookup,
+ .setattr = zonefs_inode_setattr,
+};
+
+const struct file_operations zonefs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = zonefs_readdir,
+};
+
 struct zonefs_zone_data {
 struct super_block *sb;
 unsigned int nr_zones[ZONEFS_ZTYPE_MAX];
+ sector_t cnv_zone_start;
 struct blk_zone *zones;
 };
+static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct zonefs_zone_data *zd = data;
+ struct super_block *sb = zd->sb;
+ struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+
+ /*
+ * We do not care about the first zone: it contains the super block
+ * and is not exposed as a file.
+ */
+ if (!idx)
+ return 0;
+
+ /*
+ * Count the number of zones that will be exposed as files.
+ * For sequential zones, we always have as many files as zones.
+ * For conventional zones, the number of files depends on whether
+ * conventional zone aggregation is enabled.
+ */
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (sbi->s_features & ZONEFS_F_AGGRCNV) {
+ /* One file per set of contiguous conventional zones */
+ if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) ||
+ zone->start != zd->cnv_zone_start)
+ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
+ zd->cnv_zone_start = zone->start + zone->len;
+ } else {
+ /* One file per zone */
+ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
+ }
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++;
+ break;
+ default:
+ zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
+ zone->type);
+ return -EIO;
+ }
+
+ memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));
+
+ return 0;
+}
+
+static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
+{
+ struct block_device *bdev = zd->sb->s_bdev;
+ int ret;
+
+ zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone),
+ GFP_KERNEL);
+ if (!zd->zones)
+ return -ENOMEM;
+
+ /* Get zones information from the device */
+ ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
+ zonefs_get_zone_info_cb, zd);
+ if (ret < 0) {
+ zonefs_err(zd->sb, "Zone report failed %d\n", ret);
+ return ret;
+ }
+
+ if (ret != bdev_nr_zones(bdev)) {
+ zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
+ ret, bdev_nr_zones(bdev));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd)
+{
+ kvfree(zd->zones);
+}
+
 /*
 * Create a zone group and populate it with zone files.
 */
-static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
- enum zonefs_ztype type)
+static int zonefs_init_zgroup(struct super_block *sb,
+ struct zonefs_zone_data *zd,
+ enum zonefs_ztype ztype)
 {
- struct super_block *sb = zd->sb;
 struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
 struct blk_zone *zone, *next, *end;
- const char *zgroup_name;
- char *file_name;
- struct dentry *dir, *dent;
+ struct zonefs_zone *z;
 unsigned int n = 0;
 int ret;
- /* If the group is empty, there is nothing to do */
- if (!zd->nr_zones[type])
+ /* Allocate the zone group. If it is empty, we have nothing to do. */
+ if (!zgroup->g_nr_zones)
 return 0;
- file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
- if (!file_name)
+ zgroup->g_zones = kvcalloc(zgroup->g_nr_zones,
+ sizeof(struct zonefs_zone), GFP_KERNEL);
+ if (!zgroup->g_zones)
 return -ENOMEM;
- if (type == ZONEFS_ZTYPE_CNV)
- zgroup_name = "cnv";
- else
- zgroup_name = "seq";
-
- dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (IS_ERR(dir)) {
- ret = PTR_ERR(dir);
- goto free;
- }
-
 /*
- * The first zone contains the super block: skip it.
+ * Initialize the zone groups using the device zone information.
+ * We always skip the first zone as it contains the super block
+ * and is not used to back a file.
 */
 end = zd->zones + bdev_nr_zones(sb->s_bdev);
 for (zone = &zd->zones[1]; zone < end; zone = next) {
 next = zone + 1;
- if (zonefs_zone_type(zone) != type)
+ if (zonefs_zone_type(zone) != ztype)
 continue;
+ if (WARN_ON_ONCE(n >= zgroup->g_nr_zones))
+ return -EINVAL;
+
 /*
 * For conventional zones, contiguous zones can be aggregated
 * together to form larger files. Note that this overwrites the
@@ -1595,10 +982,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 * found, assume that all zones aggregated have the same
 * condition.
 */
- if (type == ZONEFS_ZTYPE_CNV &&
+ if (ztype == ZONEFS_ZTYPE_CNV &&
 (sbi->s_features & ZONEFS_F_AGGRCNV)) {
 for (; next < end; next++) {
- if (zonefs_zone_type(next) != type)
+ if (zonefs_zone_type(next) != ztype)
 break;
 zone->len += next->len;
 zone->capacity += next->capacity;
@@ -1608,99 +995,118 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
 else if (next->cond == BLK_ZONE_COND_OFFLINE)
 zone->cond = BLK_ZONE_COND_OFFLINE;
 }
- if (zone->capacity != zone->len) {
- zonefs_err(sb, "Invalid conventional zone capacity\n");
- ret = -EINVAL;
- goto free;
- }
 }
+ z = &zgroup->g_zones[n];
+ if (ztype == ZONEFS_ZTYPE_CNV)
+ z->z_flags |= ZONEFS_ZONE_CNV;
+ z->z_sector = zone->start;
+ z->z_size = zone->len << SECTOR_SHIFT;
+ if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+ !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+ zonefs_err(sb,
+ "Invalid zone size %llu (device zone sectors %llu)\n",
+ z->z_size,
+ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+ return -EINVAL;
+ }
+
+ z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE,
+ zone->capacity << SECTOR_SHIFT);
+ z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone);
+
+ z->z_mode = S_IFREG | sbi->s_perm;
+ z->z_uid = sbi->s_uid;
+ z->z_gid = sbi->s_gid;
+
 /*
- * Use the file number within its group as file name.
+ * Let zonefs_inode_update_mode() know that we will need
+ * special initialization of the inode mode the first time
+ * it is accessed.
*/ - snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); - dent = zonefs_create_inode(dir, file_name, zone, type); - if (IS_ERR(dent)) { - ret = PTR_ERR(dent); - goto free; + z->z_flags |= ZONEFS_ZONE_INIT_MODE; + + sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); + sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; + sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed + * first to ensure that the initial number of open zones is 0, + * in sync with the open zone accounting done when the mount + * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (ztype == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); + if (ret) + return ret; } + zonefs_account_active(sb, z); + n++; } - zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", - zgroup_name, n, n > 1 ? "s" : ""); - - sbi->s_nr_files[type] = n; - ret = 0; + if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) + return -EINVAL; -free: - kfree(file_name); + zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", + zonefs_zgroup_name(ztype), + zgroup->g_nr_zones, + zgroup->g_nr_zones > 1 ? "s" : ""); - return ret; + return 0; } -static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static void zonefs_free_zgroups(struct super_block *sb) { - struct zonefs_zone_data *zd = data; - - /* - * Count the number of usable zones: the first zone at index 0 contains - * the super block and is ignored. - */ - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - zone->wp = zone->start + zone->len; - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_CNV]++; - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; - break; - default: - zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", - zone->type); - return -EIO; - } + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype; - memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); + if (!sbi) + return; - return 0; + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + kvfree(sbi->s_zgroup[ztype].g_zones); + sbi->s_zgroup[ztype].g_zones = NULL; + } } -static int zonefs_get_zone_info(struct zonefs_zone_data *zd) +/* + * Create a zone group and populate it with zone files. 
+ */ +static int zonefs_init_zgroups(struct super_block *sb) { - struct block_device *bdev = zd->sb->s_bdev; + struct zonefs_zone_data zd; + enum zonefs_ztype ztype; int ret; - zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), - GFP_KERNEL); - if (!zd->zones) - return -ENOMEM; - - /* Get zones information from the device */ - ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, - zonefs_get_zone_info_cb, zd); - if (ret < 0) { - zonefs_err(zd->sb, "Zone report failed %d\n", ret); - return ret; - } + /* First get the device zone information */ + memset(&zd, 0, sizeof(struct zonefs_zone_data)); + zd.sb = sb; + ret = zonefs_get_zone_info(&zd); + if (ret) + goto cleanup; - if (ret != bdev_nr_zones(bdev)) { - zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", - ret, bdev_nr_zones(bdev)); - return -EIO; + /* Allocate and initialize the zone groups */ + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + ret = zonefs_init_zgroup(sb, &zd, ztype); + if (ret) { + zonefs_info(sb, + "Zone group \"%s\" initialization failed\n", + zonefs_zgroup_name(ztype)); + break; + } } - return 0; -} +cleanup: + zonefs_free_zone_info(&zd); + if (ret) + zonefs_free_zgroups(sb); -static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) -{ - kvfree(zd->zones); + return ret; } /* @@ -1785,6 +1191,50 @@ free_page: return ret; } +static const struct super_operations zonefs_sops = { + .alloc_inode = zonefs_alloc_inode, + .free_inode = zonefs_free_inode, + .statfs = zonefs_statfs, + .remount_fs = zonefs_remount, + .show_options = zonefs_show_options, +}; + +static int zonefs_get_zgroup_inodes(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct inode *dir_inode; + enum zonefs_ztype ztype; + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (!sbi->s_zgroup[ztype].g_nr_zones) + continue; + + dir_inode = zonefs_get_zgroup_inode(sb, ztype); + if (IS_ERR(dir_inode)) + return PTR_ERR(dir_inode); + + sbi->s_zgroup[ztype].g_inode = dir_inode; + } + + return 0; +} + +static void zonefs_release_zgroup_inodes(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype; + + if (!sbi) + return; + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_inode) { + iput(sbi->s_zgroup[ztype].g_inode); + sbi->s_zgroup[ztype].g_inode = NULL; + } + } +} + /* * Check that the device is zoned. 
If it is, get the list of zones and create * sub-directories and files according to the device zone configuration and @@ -1792,10 +1242,9 @@ free_page: */ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) { - struct zonefs_zone_data zd; struct zonefs_sb_info *sbi; struct inode *inode; - enum zonefs_ztype t; + enum zonefs_ztype ztype; int ret; if (!bdev_is_zoned(sb->s_bdev)) { @@ -1845,16 +1294,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - memset(&zd, 0, sizeof(struct zonefs_zone_data)); - zd.sb = sb; - ret = zonefs_get_zone_info(&zd); - if (ret) - goto cleanup; - - ret = zonefs_sysfs_register(sb); - if (ret) - goto cleanup; - zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); if (!sbi->s_max_wro_seq_files && @@ -1865,7 +1304,12 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } - /* Create root directory inode */ + /* Initialize the zone groups */ + ret = zonefs_init_zgroups(sb); + if (ret) + goto cleanup; + + /* Create the root directory inode */ ret = -ENOMEM; inode = new_inode(sb); if (!inode) @@ -1875,22 +1319,37 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_mode = S_IFDIR | 0555; inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); inode->i_op = &zonefs_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &zonefs_dir_operations; + inode->i_size = 2; set_nlink(inode, 2); + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + if (sbi->s_zgroup[ztype].g_nr_zones) { + inc_nlink(inode); + inode->i_size++; + } + } sb->s_root = d_make_root(inode); if (!sb->s_root) goto cleanup; - /* Create and populate files in zone groups directories */ - for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - ret = zonefs_create_zgroup(&zd, t); - if (ret) - break; - } + /* + * Take a reference on the zone groups directory inodes + * to keep them in the inode cache. 
+ */ + ret = zonefs_get_zgroup_inodes(sb); + if (ret) + goto cleanup; + + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + + return 0; cleanup: - zonefs_cleanup_zone_info(&zd); + zonefs_release_zgroup_inodes(sb); + zonefs_free_zgroups(sb); return ret; } @@ -1905,11 +1364,13 @@ static void zonefs_kill_super(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - if (sb->s_root) - d_genocide(sb->s_root); + /* Release the reference on the zone group directory inodes */ + zonefs_release_zgroup_inodes(sb); - zonefs_sysfs_unregister(sb); kill_block_super(sb); + + zonefs_sysfs_unregister(sb); + zonefs_free_zgroups(sb); kfree(sbi); } diff --git a/fs/zonefs/sysfs.c b/fs/zonefs/sysfs.c index 9920689dc098..8ccb65c2b419 100644 --- a/fs/zonefs/sysfs.c +++ b/fs/zonefs/sysfs.c @@ -79,7 +79,7 @@ static const struct sysfs_ops zonefs_sysfs_attr_ops = { .show = zonefs_sysfs_attr_show, }; -static struct kobj_type zonefs_sb_ktype = { +static const struct kobj_type zonefs_sb_ktype = { .default_groups = zonefs_sysfs_groups, .sysfs_ops = &zonefs_sysfs_attr_ops, .release = zonefs_sysfs_sb_release, diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h index 42edcfd393ed..9969db3a9c7d 100644 --- a/fs/zonefs/trace.h +++ b/fs/zonefs/trace.h @@ -20,8 +20,9 @@ #define show_dev(dev) MAJOR(dev), MINOR(dev) TRACE_EVENT(zonefs_zone_mgmt, - TP_PROTO(struct inode *inode, enum req_op op), - TP_ARGS(inode, op), + TP_PROTO(struct super_block *sb, struct zonefs_zone *z, + enum req_op op), + TP_ARGS(sb, z, op), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) @@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt, __field(sector_t, nr_sectors) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; + __entry->dev = sb->s_dev; + __entry->ino = + z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift; __entry->op = op; - __entry->sector = ZONEFS_I(inode)->i_zsector; - __entry->nr_sectors = - ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; + __entry->sector = z->z_sector; + __entry->nr_sectors = z->z_size >> SECTOR_SHIFT; ), TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", show_dev(__entry->dev), (unsigned long)__entry->ino, @@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->sector = ZONEFS_I(inode)->i_zsector; + __entry->sector = zonefs_inode_zone(inode)->z_sector; __entry->size = size; - __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; + __entry->wpoffset = + zonefs_inode_zone(inode)->z_wpoffset; __entry->ret = ret; ), TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 1dbe78119ff1..8175652241b5 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,31 +39,53 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1U << 0) -#define ZONEFS_ZONE_ACTIVE (1U << 1) -#define ZONEFS_ZONE_OFFLINE (1U << 2) -#define ZONEFS_ZONE_READONLY (1U << 3) +#define ZONEFS_ZONE_INIT_MODE (1U << 0) +#define ZONEFS_ZONE_OPEN (1U << 1) +#define ZONEFS_ZONE_ACTIVE (1U << 2) +#define ZONEFS_ZONE_OFFLINE (1U << 3) +#define ZONEFS_ZONE_READONLY (1U << 4) +#define ZONEFS_ZONE_CNV (1U << 31) /* - * In-memory inode data. + * In-memory per-file inode zone data. 
*/ -struct zonefs_inode_info { - struct inode i_vnode; +struct zonefs_zone { + /* Zone state flags */ + unsigned int z_flags; - /* File zone type */ - enum zonefs_ztype i_ztype; + /* Zone start sector (512B unit) */ + sector_t z_sector; - /* File zone start sector (512B unit) */ - sector_t i_zsector; + /* Zone size (bytes) */ + loff_t z_size; - /* File zone write pointer position (sequential zones only) */ - loff_t i_wpoffset; + /* Zone capacity (file maximum size, bytes) */ + loff_t z_capacity; - /* File maximum size */ - loff_t i_max_size; + /* Write pointer offset in the zone (sequential zones only, bytes) */ + loff_t z_wpoffset; + + /* Saved inode uid, gid and access rights */ + umode_t z_mode; + kuid_t z_uid; + kgid_t z_gid; +}; + +/* + * In memory zone group information: all zones of a group are exposed + * as files, one file per zone. + */ +struct zonefs_zone_group { + struct inode *g_inode; + unsigned int g_nr_zones; + struct zonefs_zone *g_zones; +}; - /* File zone size */ - loff_t i_zone_size; +/* + * In-memory inode data. + */ +struct zonefs_inode_info { + struct inode i_vnode; /* * To serialise fully against both syscall and mmap based IO and @@ -82,7 +104,6 @@ struct zonefs_inode_info { /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt; - unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -90,6 +111,31 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) return container_of(inode, struct zonefs_inode_info, i_vnode); } +static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z) +{ + return z->z_flags & ZONEFS_ZONE_CNV; +} + +static inline bool zonefs_zone_is_seq(struct zonefs_zone *z) +{ + return !zonefs_zone_is_cnv(z); +} + +static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode) +{ + return inode->i_private; +} + +static inline bool zonefs_inode_is_cnv(struct inode *inode) +{ + return zonefs_zone_is_cnv(zonefs_inode_zone(inode)); +} + +static inline bool zonefs_inode_is_seq(struct inode *inode) +{ + return zonefs_zone_is_seq(zonefs_inode_zone(inode)); +} + /* * On-disk super block (block 0). */ @@ -181,7 +227,7 @@ struct zonefs_sb_info { uuid_t s_uuid; unsigned int s_zone_sectors_shift; - unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; + struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX]; loff_t s_blocks; loff_t s_used_blocks; @@ -209,6 +255,32 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) 
\ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +/* In super.c */ +void zonefs_inode_account_active(struct inode *inode); +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op); +void zonefs_i_size_write(struct inode *inode, loff_t isize); +void zonefs_update_stats(struct inode *inode, loff_t new_isize); +void __zonefs_io_error(struct inode *inode, bool write); + +static inline void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); + mutex_unlock(&zi->i_truncate_mutex); +} + +/* In super.c */ +extern const struct inode_operations zonefs_dir_inode_operations; +extern const struct file_operations zonefs_dir_operations; + +/* In file.c */ +extern const struct address_space_operations zonefs_file_aops; +extern const struct file_operations zonefs_file_operations; +int zonefs_file_truncate(struct inode *inode, loff_t isize); + +/* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); void zonefs_sysfs_unregister(struct super_block *sb); int zonefs_sysfs_init(void); |
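
For readers following the new lookup path above: zonefs_fname_to_fno() accepts only zone file names that are plain base-10 numbers with no leading zeros, and zonefs_get_file_inode() then uses the parsed value as an index into the zone group. The stand-alone user-space C sketch below mirrors that naming rule for illustration only; it is not part of the patch, and the helper name fname_to_fno() and the sample names are assumptions made for this example.

/* Illustrative only: mirrors the name rule of zonefs_fname_to_fno(). */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

static long fname_to_fno(const char *name)
{
	size_t len = strlen(name);
	long fno = 0, shift = 1;
	size_t i;

	/* A valid name is a base-10 number with no leading zeros. */
	if (!len || !isdigit((unsigned char)name[0]))
		return -ENOENT;
	if (len > 1 && name[0] == '0')
		return -ENOENT;
	if (len == 1)
		return name[0] - '0';

	/* Accumulate digits from the least significant end, as the patch does. */
	for (i = 0; i < len; i++) {
		char c = name[len - 1 - i];

		if (!isdigit((unsigned char)c))
			return -ENOENT;
		fno += (c - '0') * shift;
		shift *= 10;
	}

	return fno;
}

int main(void)
{
	const char *names[] = { "0", "42", "007", "12a", "4096" };
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-5s -> %ld\n", names[i], fname_to_fno(names[i]));

	return 0;
}

Under this rule, "0" and "42" resolve to file numbers, while "007" and "12a" are rejected, matching the -ENOENT returns in the lookup code above.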