Updated patch for 2.1.117 swap

Bill Hawes (whawes@transmeta.com)
Tue, 25 Aug 1998 09:04:23 -0700


This is a multi-part message in MIME format.
--------------302DA68DBCF9B6ABBDC231B0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

I've attached patch an updated, simplified patch to fix a potential race
condition in the swap code, and to tidy up the code and header files a
bit.

The current read_swap_cache_async() code has a potential race in that a
swap entry could become unused while allocating a new page. This can
happen in two cases: for a swap-in case, another process could complete
the swap and the swap entry could be released, and for the case when
called from try_to_unuse, the process holding the swapped page could
simply exit.

The patch avoids this problem by modifying swap_duplicate() to return a
success value if the swap entry is valid and still in use. If the
return value indicates that the entry is no longer in use,
read_swap_cache_async() frees the allocated page and returns failure.

The caller can then check whether the read_swap_cache() failure
indicates an allocation failure or an unused swap entry. This is already
done by the swap_in code, and I've added a test to try_to_unuse to
check whether the swap map entry is 0, and if so to continue searching
for other entries. If the swap map is non-zero after a failure, it
indicates the page allocation failed and try_to_unuse bails out.

The remaining changes in the patch are relatively minor and are
summarized as:

(1) Clean up include/linux/swap.h a bit by grouping prototypes by source
file,

(2) Move the definition of swapper_inode outside SWAP_CACHE_INFO
conditional,

(3) Change delete_from_swap_cache() from int to void, as the return
value is no longer used,

(4) Simplify the search loop in try_to_unuse, and improve warning
message for unused swap entries,

(5) Streamline dentry dput() code in sys_swapoff,

(6) Add error level classification to various printk messages.

The patch is pretty well tested -- I've been running it for quite a
while with no problems, and the code changes are relatively
straightforward. The race condition itself is probably quite rare, but I
think it's best to get it fixed before 2.2.

Regards,
Bill

--------------302DA68DBCF9B6ABBDC231B0
Content-Type: text/plain; charset=us-ascii; name="swap_117-patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="swap_117-patch"

--- linux-2.1.117/include/linux/swap.h.old Wed Aug 19 21:23:21 1998
+++ linux-2.1.117/include/linux/swap.h Sat Aug 22 09:07:15 1998
@@ -75,6 +75,7 @@
/* linux/mm/page_io.c */
extern void rw_swap_page(int, unsigned long, char *, int);
extern void rw_swap_page_nocache(int, unsigned long, char *);
+extern void swap_after_unlock_page (unsigned long entry);

/* linux/mm/page_alloc.c */
extern void swap_in(struct task_struct *, struct vm_area_struct *,
@@ -84,10 +85,15 @@
/* linux/mm/swap_state.c */
extern void show_swap_cache_info(void);
extern int add_to_swap_cache(struct page *, unsigned long);
-extern void swap_duplicate(unsigned long);
-extern void swap_after_unlock_page (unsigned long entry);
+extern int swap_duplicate(unsigned long);
+extern int swap_check_entry(unsigned long);
extern struct page * read_swap_cache_async(unsigned long, int);
#define read_swap_cache(entry) read_swap_cache_async(entry, 1);
+/*
+ * Make these inline later once they are working properly.
+ */
+extern void delete_from_swap_cache(struct page *page);
+extern void free_page_and_swap_cache(unsigned long addr);

/* linux/mm/swapfile.c */
extern unsigned int nr_swapfiles;
@@ -100,6 +106,8 @@
int next; /* swapfile to be used next */
};
extern struct swap_list_t swap_list;
+int sys_swapoff(const char *);
+int sys_swapon(const char *, int);

/*
* vm_ops not present page codes for shared memory.
@@ -147,14 +155,6 @@
count--;
return (count > 1);
}
-
-/*
- * Make these inline later once they are working properly.
- */
-extern long find_in_swap_cache(struct page *page);
-extern int delete_from_swap_cache(struct page *page);
-extern void remove_from_swap_cache(struct page *page);
-extern void free_page_and_swap_cache(unsigned long addr);

#endif /* __KERNEL__*/

--- linux-2.1.117/mm/swap_state.c.old Wed Aug 19 21:23:21 1998
+++ linux-2.1.117/mm/swap_state.c Sat Aug 22 12:53:06 1998
@@ -24,14 +24,6 @@
#include <asm/bitops.h>
#include <asm/pgtable.h>

-#ifdef SWAP_CACHE_INFO
-unsigned long swap_cache_add_total = 0;
-unsigned long swap_cache_add_success = 0;
-unsigned long swap_cache_del_total = 0;
-unsigned long swap_cache_del_success = 0;
-unsigned long swap_cache_find_total = 0;
-unsigned long swap_cache_find_success = 0;
-
/*
* Keep a reserved false inode which we will use to mark pages in the
* page cache are acting as swap cache instead of file cache.
@@ -43,6 +35,13 @@
*/
struct inode swapper_inode;

+#ifdef SWAP_CACHE_INFO
+unsigned long swap_cache_add_total = 0;
+unsigned long swap_cache_add_success = 0;
+unsigned long swap_cache_del_total = 0;
+unsigned long swap_cache_del_success = 0;
+unsigned long swap_cache_find_total = 0;
+unsigned long swap_cache_find_success = 0;

void show_swap_cache_info(void)
{
@@ -63,13 +62,13 @@
page_address(page), atomic_read(&page->count), entry);
#endif
if (PageTestandSetSwapCache(page)) {
- printk("swap_cache: replacing non-empty entry %08lx "
+ printk(KERN_ERR "swap_cache: replacing non-empty entry %08lx "
"on page %08lx\n",
page->offset, page_address(page));
return 0;
}
if (page->inode) {
- printk("swap_cache: replacing page-cached entry "
+ printk(KERN_ERR "swap_cache: replacing page-cached entry "
"on page %08lx\n", page_address(page));
return 0;
}
@@ -85,12 +84,16 @@
}

/*
- * If swap_map[] reaches SWAP_MAP_MAX the entries are treated as "permanent".
+ * Verify that a swap entry is valid and increment its swap map count.
+ *
+ * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
+ * "permanent", but will be reclaimed by the next swapoff.
*/
-void swap_duplicate(unsigned long entry)
+int swap_duplicate(unsigned long entry)
{
struct swap_info_struct * p;
unsigned long offset, type;
+ int result = 0;

if (!entry)
goto out;
@@ -105,36 +108,44 @@
goto bad_offset;
if (!p->swap_map[offset])
goto bad_unused;
+ /*
+ * Entry is valid, so increment the map count.
+ */
if (p->swap_map[offset] < SWAP_MAP_MAX)
p->swap_map[offset]++;
else {
static int overflow = 0;
if (overflow++ < 5)
- printk("swap_duplicate: entry %08lx map count=%d\n",
+ printk(KERN_WARNING
+ "swap_duplicate: entry %08lx map count=%d\n",
entry, p->swap_map[offset]);
p->swap_map[offset] = SWAP_MAP_MAX;
}
+ result = 1;
#ifdef DEBUG_SWAP
printk("DebugVM: swap_duplicate(entry %08lx, count now %d)\n",
entry, p->swap_map[offset]);
#endif
out:
- return;
+ return result;

bad_file:
- printk("swap_duplicate: Trying to duplicate nonexistent swap-page\n");
+ printk(KERN_ERR
+ "swap_duplicate: entry %08lx, nonexistent swap file\n", entry);
goto out;
bad_offset:
- printk("swap_duplicate: offset exceeds max\n");
+ printk(KERN_ERR
+ "swap_duplicate: entry %08lx, offset exceeds max\n", entry);
goto out;
bad_unused:
- printk("swap_duplicate at %8p: unused page\n",
- __builtin_return_address(0));
+ printk(KERN_ERR
+ "swap_duplicate at %8p: entry %08lx, unused page\n",
+ __builtin_return_address(0), entry);
goto out;
}


-void remove_from_swap_cache(struct page *page)
+static inline void remove_from_swap_cache(struct page *page)
{
if (!page->inode) {
printk ("VM: Removing swap cache page with zero inode hash "
@@ -163,7 +174,7 @@
}


-int delete_from_swap_cache(struct page *page)
+void delete_from_swap_cache(struct page *page)
{
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
@@ -180,9 +191,7 @@
#endif
remove_from_swap_cache (page);
swap_free (entry);
- return 1;
}
- return 0;
}

/*
@@ -218,57 +227,67 @@
found = find_page(&swapper_inode, entry);
if (!found)
return 0;
- if (found->inode != &swapper_inode
- || !PageSwapCache(found)) {
- __free_page(found);
- printk ("VM: Found a non-swapper swap page!\n");
- return 0;
- }
+ if (found->inode != &swapper_inode || !PageSwapCache(found))
+ goto out_bad;
if (!PageLocked(found))
return found;
__free_page(found);
__wait_on_page(found);
}
+
+out_bad:
+ printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
+ __free_page(found);
+ return 0;
}

/*
* Locate a page of swap in physical memory, reserving swap cache space
* and reading the disk if it is not already cached. If wait==0, we are
* only doing readahead, so don't worry if the page is already locked.
+ *
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
*/

struct page * read_swap_cache_async(unsigned long entry, int wait)
{
- struct page *found_page, *new_page = 0;
- unsigned long new_page_addr = 0;
+ struct page *found_page, *new_page;
+ unsigned long new_page_addr;

#ifdef DEBUG_SWAP
printk("DebugVM: read_swap_cache_async entry %08lx%s\n",
entry, wait ? ", wait" : "");
#endif
-repeat:
+ /*
+ * Look for the page in the swap cache.
+ */
found_page = lookup_swap_cache(entry);
- if (found_page) {
- if (new_page)
- __free_page(new_page);
- return found_page;
- }
+ if (found_page)
+ goto out;
+
+ new_page_addr = __get_free_page(GFP_KERNEL);
+ if (!new_page_addr)
+ goto out; /* Out of memory */
+ new_page = mem_map + MAP_NR(new_page_addr);
+
+ /*
+ * Check the swap cache again, in case we stalled above.
+ */
+ found_page = lookup_swap_cache(entry);
+ if (found_page)
+ goto out_free_page;
+ /*
+ * Make sure the swap entry is still in use.
+ */
+ if (!swap_duplicate(entry)) /* Account for the swap cache */
+ goto out_free_page;
+ /*
+ * Add it to the swap cache and read its contents.
+ */
+ if (!add_to_swap_cache(new_page, entry))
+ goto out_free_page;

- /* The entry is not present. Lock down a new page, add it to
- * the swap cache and read its contents. */
- if (!new_page) {
- new_page_addr = __get_free_page(GFP_KERNEL);
- if (!new_page_addr)
- return 0; /* Out of memory */
- new_page = mem_map + MAP_NR(new_page_addr);
- goto repeat; /* We might have stalled */
- }
-
- if (!add_to_swap_cache(new_page, entry)) {
- free_page(new_page_addr);
- return 0;
- }
- swap_duplicate(entry); /* Account for the swap cache */
set_bit(PG_locked, &new_page->flags);
rw_swap_page(READ, entry, (char *) new_page_addr, wait);
#ifdef DEBUG_SWAP
@@ -277,5 +296,9 @@
entry, (char *) page_address(new_page));
#endif
return new_page;
-}

+out_free_page:
+ __free_page(new_page);
+out:
+ return found_page;
+}
--- linux-2.1.117/mm/swapfile.c.old Wed Aug 19 21:23:21 1998
+++ linux-2.1.117/mm/swapfile.c Sat Aug 22 09:32:06 1998
@@ -296,30 +296,36 @@
{
struct swap_info_struct * si = &swap_info[type];
struct task_struct *p;
- unsigned long page = 0;
struct page *page_map;
- unsigned long entry;
+ unsigned long entry, page;
int i;

while (1) {
/*
* Find a swap page in use and read it in.
*/
- for (i = 1 , entry = 0; i < si->max ; i++) {
+ for (i = 1; i < si->max ; i++) {
if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
- entry = SWP_ENTRY(type, i);
- break;
+ goto found_entry;
}
}
- if (!entry)
- break;
+ break;
+
+ found_entry:
+ entry = SWP_ENTRY(type, i);

/* Get a page for the entry, using the existing swap
cache page if there is one. Otherwise, get a clean
page and read the swap into it. */
page_map = read_swap_cache(entry);
- if (!page_map)
- return -ENOMEM;
+ if (!page_map) {
+ /*
+ * Continue searching if the entry became unused.
+ */
+ if (si->swap_map[i] == 0)
+ continue;
+ return -ENOMEM;
+ }
page = page_address(page_map);
read_lock(&tasklist_lock);
for_each_task(p)
@@ -329,11 +335,15 @@
page we've been using. */
if (PageSwapCache(page_map))
delete_from_swap_cache(page_map);
- free_page(page);
+ __free_page(page_map);
+ /*
+ * Check for and clear any overflowed swap map counts.
+ */
if (si->swap_map[i] != 0) {
if (si->swap_map[i] != SWAP_MAP_MAX)
- printk("try_to_unuse: entry %08lx "
- "not in use\n", entry);
+ printk(KERN_ERR
+ "try_to_unuse: entry %08lx count=%d\n",
+ entry, si->swap_map[i]);
si->swap_map[i] = 0;
nr_swap_pages++;
}
@@ -374,10 +384,9 @@
prev = type;
}
err = -EINVAL;
- if (type < 0){
- dput(dentry);
- goto out;
- }
+ if (type < 0)
+ goto out_dput;
+
if (prev < 0) {
swap_list.head = p->next;
} else {
@@ -390,7 +399,6 @@
p->flags = SWP_USED;
err = try_to_unuse(type);
if (err) {
- dput(dentry);
/* re-insert swap space back into swap_list */
for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
if (p->prio >= swap_info[i].prio)
@@ -401,7 +409,7 @@
else
swap_info[prev].next = p - swap_info;
p->flags = SWP_WRITEOK;
- goto out;
+ goto out_dput;
}
if(p->swap_device){
memset(&filp, 0, sizeof(filp));
@@ -416,9 +424,9 @@
}
dput(dentry);

- nr_swap_pages -= p->pages;
- dput(p->swap_file);
+ dentry = p->swap_file;
p->swap_file = NULL;
+ nr_swap_pages -= p->pages;
p->swap_device = 0;
vfree(p->swap_map);
p->swap_map = NULL;
@@ -426,6 +434,9 @@
p->swap_lockmap = NULL;
p->flags = 0;
err = 0;
+
+out_dput:
+ dput(dentry);
out:
unlock_kernel();
return err;
@@ -717,4 +728,3 @@
val->totalswap <<= PAGE_SHIFT;
return;
}
-

--------------302DA68DBCF9B6ABBDC231B0--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.altern.org/andrebalsa/doc/lkml-faq.html