Re: 2.6.19 file content corruption on ext3

From: Linus Torvalds
Date: Thu Dec 28 2006 - 16:25:24 EST




On Thu, 28 Dec 2006, Linus Torvalds wrote:
>
> What we need now is actually looking at the source code, and people who
> understand the VM, I'm afraid. I'm gathering traces now that I have a good
> test-case. I'll post my trace tools once I've tested that they work, in
> case others want to help.

Ok, I've got the traces, but quite frankly, I doubt anybody is crazy
enough to want to trawl through them. It's a bit painful, since we're
talking thousands of pages to trigger this problem.

Also, I've used the PG_arch_1 flag, which is fine on x86[-64] and probably
ARM, but is used for other things on ia64, powerpc and sparc64. But here's
the patch in case anybody cares.

It wants a _big_ kernel buffer to capture all the crud into (which is why
I made the thing accept a bigger log buffer), and quite frankly, I'm not
at all sure that all the locking is ok (ie I could imagine that the
dcache-locking thing there in "is_interesting()" could deadlock, what do I
know..)

But I've captured some real data with this, which I'll describe
separately.

Linus

----
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 350878a..967dd80 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -91,6 +91,8 @@
#define PG_nosave_free 18 /* Used for system suspend/resume */
#define PG_buddy 19 /* Page is free, on buddy lists */

+#define SetPageInteresting(page) set_bit(PG_arch_1, &(page)->flags)
+#define PageInteresting(page) test_bit(PG_arch_1, &(page)->flags)

#if (BITS_PER_LONG > 32)
/*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5c26818..7735b83 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -79,7 +79,7 @@ config DEBUG_KERNEL

config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
- range 12 21
+ range 12 24
default 17 if S390 || LOCKDEP
default 16 if X86_NUMAQ || IA64
default 15 if SMP
diff --git a/mm/filemap.c b/mm/filemap.c
index 8332c77..d6a0f56 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -116,6 +116,7 @@ void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;

+if (PageInteresting(page)) printk("Removing index %08x from page cache\n", page->index);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
@@ -421,6 +422,23 @@ int filemap_write_and_wait_range(struct address_space *mapping,
return err;
}

+static noinline int is_interesting(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct dentry *dentry;
+ int retval = 0;
+
+ spin_lock(&dcache_lock);
+ list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+ if (strcmp(dentry->d_name.name, "mapfile"))
+ continue;
+ retval = 1;
+ break;
+ }
+ spin_unlock(&dcache_lock);
+ return retval;
+}
+
/**
* add_to_page_cache - add newly allocated pagecache pages
* @page: page to add
@@ -439,6 +457,9 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);

+ if (is_interesting(mapping))
+ SetPageInteresting(page);
+
if (error == 0) {
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
diff --git a/mm/memory.c b/mm/memory.c
index 563792f..14c9815 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -667,6 +667,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
+if (PageInteresting(page))
+ printk("Unmapped index %08x at %08x\n", page->index, addr);
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
@@ -1605,6 +1607,7 @@ gotten:
*/
ptep_clear_flush(vma, address, page_table);
set_pte_at(mm, address, page_table, entry);
+if (PageInteresting(new_page)) printk("do_wp_page: mapping index %08x at %08lx\n", new_page->index, address);
update_mmu_cache(vma, address, entry);
lru_cache_add_active(new_page);
page_add_new_anon_rmap(new_page, vma, address);
@@ -2249,6 +2252,7 @@ retry:
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+if (PageInteresting(new_page)) printk("do_no_page: mapping index %08x at %08lx (%s)\n", new_page->index, address, write_access ? "write" : "read");
set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b3a198c..0466601 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -813,6 +813,7 @@ int fastcall set_page_dirty(struct page *page)
if (!spd)
spd = __set_page_dirty_buffers;
#endif
+if (PageInteresting(page)) printk("Setting page %08x dirty\n", page->index);
return (*spd)(page);
}
if (!PageDirty(page)) {
@@ -867,6 +868,7 @@ int clear_page_dirty_for_io(struct page *page)

if (TestClearPageDirty(page)) {
if (mapping_cap_account_dirty(mapping)) {
+if (PageInteresting(page)) printk("cpd_for_io: index %08x\n", page->index);
page_mkclean(page);
dec_zone_page_state(page, NR_FILE_DIRTY);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 57306fa..e98e84c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -448,6 +448,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
if (pte_dirty(*pte) || pte_write(*pte)) {
pte_t entry;

+if (PageInteresting(page)) printk("cleaning index %08x at %08x\n", page->index, address);
flush_cache_page(vma, address, pte_pfn(*pte));
entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
@@ -637,6 +638,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
goto out_unmap;
}

+if (PageInteresting(page)) printk("unmapping index %08x from %08lx\n", page->index, address);
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
@@ -767,6 +769,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
if (ptep_clear_flush_young(vma, address, pte))
continue;

+if (PageInteresting(page)) printk("Cluster-unmapping %08x from %08lx\n", page->index, address);
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/