[RFC v3 7/7] shm: isolate pinned pages when sealing files

From: David Herrmann
Date: Fri Jun 13 2014 - 06:45:40 EST


When setting SEAL_WRITE, we must make sure nobody has a writable reference
to the pages (via GUP or similar). We currently check references and wait
some time for them to be dropped. This, however, might fail for several
reasons, including:
- the page is pinned for longer than we wait
- while we wait, someone takes an already pinned page for read-access

Therefore, this patch introduces page-isolation. When sealing a file with
SEAL_WRITE, we copy all pages that have an elevated ref-count. The new page
is put in place atomically; the old page is detached and left alone. It
will get reclaimed once the last external user has dropped it.

Signed-off-by: David Herrmann <dh.herrmann@xxxxxxxxx>
---
mm/shmem.c | 218 +++++++++++++++++++++++++++++--------------------------------
1 file changed, 105 insertions(+), 113 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index ddc3998..34b14fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1237,6 +1237,110 @@ unlock:
return error;
}

+static int shmem_isolate_page(struct inode *inode, struct page *oldpage)
+{ /* Swap @oldpage for a private copy in @inode's mapping: returns 1 if replaced, 0 if left alone, <0 on error. */
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct page *newpage;
+ int error;
+
+ if (oldpage->mapping != mapping)
+ return 0; /* page no longer belongs to this inode's mapping */
+ if (page_count(oldpage) - page_mapcount(oldpage) <= 2)
+ return 0; /* NOTE(review): 2 presumably = page-cache ref + caller's lookup ref, i.e. no foreign pin - confirm */
+
+ if (page_mapped(oldpage)) /* tear down mappings of this one page-sized range before copying */
+ unmap_mapping_range(mapping,
+ (loff_t)oldpage->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+
+ VM_BUG_ON_PAGE(PageWriteback(oldpage), oldpage);
+ VM_BUG_ON_PAGE(page_has_private(oldpage), oldpage);
+
+ newpage = shmem_alloc_page(mapping_gfp_mask(mapping), info,
+ oldpage->index);
+ if (!newpage)
+ return -ENOMEM;
+
+ __set_page_locked(newpage); /* stays locked until one of the unlock_page(newpage) calls below */
+ copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
+
+ page_cache_get(newpage); /* extra reference; released again on both exit paths below */
+ SetPageUptodate(newpage);
+ SetPageSwapBacked(newpage); /* NOTE(review): newpage is not marked dirty anywhere in this function - confirm that is intended */
+ newpage->mapping = mapping;
+ newpage->index = oldpage->index;
+
+ cancel_dirty_page(oldpage, PAGE_CACHE_SIZE); /* NOTE(review): runs before the replace below can fail, and on failure oldpage stays in the mapping - confirm */
+
+ spin_lock_irq(&mapping->tree_lock);
+ error = shmem_radix_tree_replace(mapping, oldpage->index,
+ oldpage, newpage);
+ if (!error) { /* move the NR_FILE_PAGES accounting from oldpage to newpage */
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (error) { /* replace failed: detach newpage again and drop the references held on it */
+ newpage->mapping = NULL;
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ page_cache_release(newpage);
+ return error;
+ }
+
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+
+ oldpage->mapping = NULL; /* oldpage is now detached; per the changelog it is reclaimed once its last user drops it */
+ page_cache_release(oldpage); /* presumably the reference the page cache held on oldpage - confirm */
+ unlock_page(newpage);
+ page_cache_release(newpage); /* pairs with page_cache_get(newpage) above */
+
+ return 1;
+}
+
+static int shmem_isolate_pins(struct inode *inode)
+{ /* Scan @inode's mapping from index 0 and isolate every page with extra references; returns 0 or the last error seen. */
+ struct address_space *mapping = inode->i_mapping;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index;
+ int i, ret, error;
+
+ pagevec_init(&pvec, 0);
+ index = 0;
+ error = 0;
+ while ((pvec.nr = find_get_entries(mapping, index, PAGEVEC_SIZE,
+ pvec.pages, indices))) { /* loop ends when a lookup returns no entries */
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ index = indices[i]; /* track the last index seen so the next lookup resumes after it */
+ if (radix_tree_exceptional_entry(page))
+ continue; /* exceptional radix-tree entry, not a page pointer */
+ if (page->mapping != mapping)
+ continue;
+ if (page_count(page) - page_mapcount(page) <= 2)
+ continue; /* unlocked pre-check; shmem_isolate_page() repeats it under the page lock */
+
+ lock_page(page);
+ ret = shmem_isolate_page(inode, page);
+ if (ret < 0)
+ error = ret; /* remember the failure but keep scanning the remaining pages */
+ unlock_page(page); /* unlocks the original page even when it was just replaced */
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++; /* continue the scan right after the last entry of this batch */
+ }
+
+ return error;
+}
+
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
@@ -1734,118 +1838,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}

-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN 4 /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
-
- start = 0;
- rcu_read_lock();
-
-restart:
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
- } else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_set(&mapping->page_tree, iter.index,
- SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
- }
-
- if (need_resched()) {
- cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
- }
- }
- rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
- int error, scan;
-
- shmem_tag_pins(mapping);
-
- error = 0;
- for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
- break;
-
- if (!scan)
- lru_add_drain_all();
- else if (schedule_timeout_killable((HZ << scan) / 200))
- scan = LAST_SCAN;
-
- start = 0;
- rcu_read_lock();
-restart:
- radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
- start, SHMEM_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
- /*
- * On the last scan, we clean up all those tags
- * we inserted; but make a note that we still
- * found pages pinned.
- */
- error = -EBUSY;
- }
-
- spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(&mapping->page_tree,
- iter.index, SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
-continue_resched:
- if (need_resched()) {
- cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
- }
- }
- rcu_read_unlock();
- }
-
- return error;
-}
-
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
@@ -1907,7 +1899,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
if (error)
goto unlock;

- error = shmem_wait_for_pins(file->f_mapping);
+ error = shmem_isolate_pins(inode);
if (error) {
mapping_allow_writable(file->f_mapping);
goto unlock;
--
2.0.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/