diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 275 |
1 files changed, 188 insertions, 87 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 847ee43c2806..7c00f105845e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include <linux/export.h> #include <linux/compiler.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/capability.h> @@ -100,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -175,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -194,6 +193,30 @@ void __delete_from_page_cache(struct page *page, void *shadow, else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(page_mapped(page), page); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + int mapcount; + + pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page(page, "still mapped when deleted"); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + mapcount = page_mapcount(page); + if (mapping_exiting(mapping) && + page_count(page) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped, + * and we'd prefer not to leak it: if we're wrong, + * some other bad page check should catch it later. + */ + page_mapcount_reset(page); + atomic_sub(mapcount, &page->_count); + } + } + page_cache_tree_delete(mapping, page, shadow); page->mapping = NULL; @@ -204,7 +227,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * At this point page must be either written or cleaned by truncate. @@ -215,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -230,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -239,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -445,7 +463,8 @@ int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -481,7 +500,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -525,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -535,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -550,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); @@ -569,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; @@ -579,9 +596,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -1234,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1242,12 +1262,14 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } @@ -1296,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1306,13 +1327,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - WARN_ON(iter.index); - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1363,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: @@ -1374,12 +1389,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1439,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; @@ -1450,12 +1460,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page. @@ -1494,6 +1500,69 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1570,6 +1639,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -1811,6 +1889,7 @@ EXPORT_SYMBOL(generic_file_read_iter); * page_cache_read - adds requested page to the page cache if not already there * @file: file to read * @offset: page index + * @gfp_mask: memory allocation flags * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. @@ -2072,10 +2151,11 @@ repeat: if (unlikely(!page)) goto next; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - break; - else - goto next; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + goto next; } if (!page_cache_get_speculative(page)) @@ -2204,7 +2284,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2226,53 +2306,74 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) - -{ - struct page *page; - int err; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } + if (PageUptodate(page)) + goto out; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. + */ + wait_on_page_locked(page); if (PageUptodate(page)) goto out; + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; @@ -2684,11 +2785,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; |