Re: [RFC v2][PATCH 4/9] Memory management - dump state

From: Oren Laadan
Date: Sun Aug 24 2008 - 01:41:32 EST

Next message: FUJITA Tomonori: "Re: Writable packet CD-RW mounting broken in 2.6.26 by your commit68154e90c9d1492d570671ae181d9a8f8530da55"
Previous message: Andrew Morton: "Re: [PATCH 2.6.27-rc3] led: driver for LEDs on PCEngines ALIX.2 andALIX.3 boards"
Next in thread: Dave Hansen: "Re: [RFC v2][PATCH 4/9] Memory management - dump state"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Dave Hansen wrote:

On Wed, 2008-08-20 at 23:05 -0400, Oren Laadan wrote:
index 7ecafb3..0addb63 100644
--- a/checkpoint/ckpt.h
+++ b/checkpoint/ckpt.h
@@ -29,6 +29,9 @@ struct cr_ctx {
void *hbuf; /* header: to avoid many alloc/dealloc */
int hpos;

+ struct cr_pgarr *pgarr;
+ struct cr_pgarr *pgcur;
+
struct path *vfsroot; /* container root */
};

These need much better variable names. From this, I have no idea what
they do. Checkpoint Restart Pirates Go ARR!

@@ -62,6 +65,9 @@ int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n);
int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type);
int cr_read_str(struct cr_ctx *ctx, void *str, int n);

+int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
+int cr_read_mm(struct cr_ctx *ctx);
+
int do_checkpoint(struct cr_ctx *ctx);
int do_restart(struct cr_ctx *ctx);

diff --git a/checkpoint/ckpt_arch.h b/checkpoint/ckpt_arch.h
index b7cc8c9..3b87a6f 100644
--- a/checkpoint/ckpt_arch.h
+++ b/checkpoint/ckpt_arch.h
@@ -2,6 +2,7 @@

int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t);
int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t);
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int ptag);

int cr_read_thread(struct cr_ctx *ctx);
int cr_read_cpu(struct cr_ctx *ctx);
diff --git a/checkpoint/ckpt_hdr.h b/checkpoint/ckpt_hdr.h
index a478b7c..a3919cf 100644
--- a/checkpoint/ckpt_hdr.h
+++ b/checkpoint/ckpt_hdr.h
@@ -30,6 +30,7 @@ struct cr_hdr {
__u32 ptag;
};

+/* header types */
enum {
CR_HDR_HEAD = 1,
CR_HDR_STR,
@@ -45,6 +46,12 @@ enum {
CR_HDR_TAIL = 5001
};

+/* vma subtypes */
+enum {
+ CR_VMA_ANON = 1,
+ CR_VMA_FILE
+};

Is this really necessary, or can we infer it from the contents of the
VMA?

This classification eventually simplifies both dump and restore. For
instance, it decides whether a file name follows or not.

There will be more, later: CR_VMA_FILE_UNLINKED (mapped to an unlinked
file), CR_VMA_ANON_SHARED (shared anonymous), CR_VMA_ANON_SHARED_SKIP
(shared anonymous, had been sent before) and so on.

struct cr_hdr_head {
__u64 magic;

@@ -83,4 +90,28 @@ struct cr_hdr_task {
char comm[TASK_COMM_LEN];
} __attribute__ ((aligned (8)));

+struct cr_hdr_mm {
+ __u32 tag; /* sharing identifier */

If this really is a sharing identifier, we need a:

struct cr_shared_object_ref {
__u32 tag;
};

And then one of *those* in the vma struct. Make it much more idiot (aka
Dave) proof.

I figured the use of 'tag' for the identifiers of shared objects is clear.
By using a __u32 the size of that field is immediately visible, while using
a structure will hide the actual size; in turn this is what we want visible
here (ABI), no ?

+ __s16 map_count;
+ __s16 _padding;
+
+ __u64 start_code, end_code, start_data, end_data;
+ __u64 start_brk, brk, start_stack;
+ __u64 arg_start, arg_end, env_start, env_end;
+
+} __attribute__ ((aligned (8)));
+
+struct cr_hdr_vma {
+ __u32 how;

It is too bad that we can't actually use the enum type here. It would
make it *much* more obvious what this was. I actually had to go look at
the code below to figure it out.

Using __u32 instead of enum guarantees the size. I'll change the
name and move the enum nearby.

+ __s16 npages;

Wow. Linux only supports 256MB in a single VMA? I didn't know that.
Maybe we should make this type bigger. :)

This also needs to get called something much more descriptive, like
nr_present_pages.

#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/checkpoint/ckpt_mem.c b/checkpoint/ckpt_mem.c
new file mode 100644
index 0000000..a23aa29
--- /dev/null
+++ b/checkpoint/ckpt_mem.c
@@ -0,0 +1,382 @@
+/*
+ * Checkpoint memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/mm_types.h>
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+#include "ckpt_arch.h"
+#include "ckpt_mem.h"
+
+/*
+ * utilities to alloc, free, and handle 'struct cr_pgarr'
+ * (common to ckpt_mem.c and rstr_mem.c)
+ */
+
+#define CR_PGARR_ORDER 0
+#define CR_PGARR_TOTAL ((PAGE_SIZE << CR_PGARR_ORDER) / sizeof(void *))

Another allocator? Really?

+/* release pages referenced by a page-array */
+void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr)
+{
+ int n;
+
+ /* only checkpoint keeps references to pages */
+ if (ctx->flags & CR_CTX_CKPT) {
+ cr_debug("nused %d\n", pgarr->nused);
+ for (n = pgarr->nused; n--; )
+ page_cache_release(pgarr->pages[n]);
+ }
+ pgarr->nused = 0;
+ pgarr->nleft = CR_PGARR_TOTAL;
+}
+
+/* release pages referenced by chain of page-arrays */
+void cr_pgarr_release(struct cr_ctx *ctx)
+{
+ struct cr_pgarr *pgarr;
+
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next)
+ _cr_pgarr_release(ctx, pgarr);
+}
+
+/* free a chain of page-arrays */
+void cr_pgarr_free(struct cr_ctx *ctx)
+{
+ struct cr_pgarr *pgarr, *pgnxt;
+
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgnxt) {
+ _cr_pgarr_release(ctx, pgarr);
+ free_pages((unsigned long) ctx->pgarr->addrs, CR_PGARR_ORDER);
+ free_pages((unsigned long) ctx->pgarr->pages, CR_PGARR_ORDER);
+ pgnxt = pgarr->next;
+ kfree(pgarr);
+ }
+}
+
+/* allocate and add a new page-array to chain */
+struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew)
+{
+ struct cr_pgarr *pgarr = ctx->pgcur;
+
+ if (pgarr && pgarr->next) {
+ ctx->pgcur = pgarr->next;
+ return pgarr->next;
+ }
+
+ if ((pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL))) {

This entire nested if(){} should be brought back to 1 tab. Remember, no
assignments in if() conditions.

+ pgarr->nused = 0;
+ pgarr->nleft = CR_PGARR_TOTAL;
+ pgarr->addrs = (unsigned long *)
+ __get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
+ pgarr->pages = (struct page **)
+ __get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
+ if (likely(pgarr->addrs && pgarr->pages)) {
+ *pgnew = pgarr;
+ ctx->pgcur = pgarr;
+ return pgarr;
+ } else if (pgarr->addrs)
+ free_pages((unsigned long) pgarr->addrs,
+ CR_PGARR_ORDER);
+ kfree(pgarr);
+ }
+
+ return NULL;
+}
+
+/* return current page-array (and allocate if needed) */
+struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx)
+{
+ struct cr_pgarr *pgarr = ctx->pgcur;
+
+ if (unlikely(!pgarr->nleft))
+ pgarr = cr_pgarr_alloc(ctx, &pgarr->next);
+ return pgarr;
+}
+
+/*
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
+ * @ctx - checkpoint context
+ * @pgarr - page-array to fill
+ * @vma - vma to scan
+ * @start - start address (updated)
+ */
+static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
+ struct vm_area_struct *vma, unsigned long *start)
+{
+ unsigned long end = vma->vm_end;
+ unsigned long addr = *start;
+ struct page **pagep;
+ unsigned long *addrp;
+ int cow, nr, ret = 0;
+
+ nr = pgarr->nleft;
+ pagep = &pgarr->pages[pgarr->nused];
+ addrp = &pgarr->addrs[pgarr->nused];
+ cow = !!vma->vm_file;
+
+ while (addr < end) {
+ struct page *page;
+
+ /* simplified version of get_user_pages(): already have vma,
+ * only need FOLL_TOUCH, and (for now) ignore fault stats */
+
+ cond_resched();
+ while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
+ ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ ret = -ENOMEM;
+ else if (ret & VM_FAULT_SIGBUS)
+ ret = -EFAULT;
+ else
+ BUG();
+ break;
+ }
+ cond_resched();
+ }

At best this needs to get folded back into its own function. This is

This is almost identical to the original - see the preceding comment.

pretty hard to read as-is. Also, are there truly no in-kernel functions
that can be used for this?

Can you suggest one ?

+ if (IS_ERR(page)) {
+ ret = PTR_ERR(page);
+ break;
+ }
+
+ if (page == ZERO_PAGE(0))
+ page = NULL; /* zero page: ignore */
+ else if (cow && page_mapping(page) != NULL)
+ page = NULL; /* clean cow: ignore */
+ else {

Put the curly brackets in here. It doesn't take up space.

+ get_page(page);
+ *(addrp++) = addr;
+ *(pagep++) = page;
+ if (--nr == 0) {
+ addr += PAGE_SIZE;
+ break;
+ }
+ }
+
+ addr += PAGE_SIZE;
+ }
+
+ if (unlikely(ret < 0)) {
+ nr = pgarr->nleft - nr;
+ while (nr--)
+ page_cache_release(*(--pagep));
+ return ret;
+ }
+
+ *start = addr;
+ return (pgarr->nleft - nr);
+}
+
+/**
+ * cr_vma_scan_pages - scan vma for pages that will need to be dumped
+ * @ctx - checkpoint context
+ * @vma - vma to scan
+ *
+ * a list of addr/page tuples is kept in ctx->pgarr page-array chain
+ */
+static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ struct cr_pgarr *pgarr;
+ int nr, total = 0;
+
+ while (addr < end) {
+ if (!(pgarr = cr_pgarr_prep(ctx)))
+ return -ENOMEM;
+ if ((nr = cr_vma_fill_pgarr(ctx, pgarr, vma, &addr)) < 0)
+ return nr;
+ pgarr->nleft -= nr;
+ pgarr->nused += nr;
+ total += nr;
+ }
+
+ cr_debug("total %d\n", total);
+ return total;
+}
+
+/**
+ * cr_vma_dump_pages - dump pages listed in the ctx page-array chain
+ * @ctx - checkpoint context
+ * @total - total number of pages
+ */
+static int cr_vma_dump_pages(struct cr_ctx *ctx, int total)
+{
+ struct cr_pgarr *pgarr;
+ int ret;
+
+ if (!total)
+ return 0;
+
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
+ ret = cr_kwrite(ctx, pgarr->addrs,
+ pgarr->nused * sizeof(*pgarr->addrs));
+ if (ret < 0)
+ return ret;
+ }
+
+ for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) {
+ struct page **pages = pgarr->pages;
+ int nr = pgarr->nused;
+ void *ptr;
+
+ while (nr--) {
+ ptr = kmap(*pages);
+ ret = cr_kwrite(ctx, ptr, PAGE_SIZE);
+ kunmap(*pages);

Why not use kmap_atomic() here?

It is my understanding that the code must not sleep between kmap_atomic()
and kunmap_atomic().

+ if (ret < 0)
+ return ret;
+ pages++;
+ }
+ }
+
+ return total;
+}
+
+static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma)
+{
+ struct cr_hdr h;
+ struct cr_hdr_vma *hh = ctx->hbuf;
+ char *fname = NULL;
+ int flen = 0, how, nr, ret;
+
+ h.type = CR_HDR_VMA;
+ h.len = sizeof(*hh);
+ h.ptag = 0;
+
+ hh->vm_start = vma->vm_start;
+ hh->vm_end = vma->vm_end;
+ hh->vm_page_prot = vma->vm_page_prot.pgprot;
+ hh->vm_flags = vma->vm_flags;
+ hh->vm_pgoff = vma->vm_pgoff;
+
+ if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) {
+ pr_warning("CR: unknown VMA %#lx\n", vma->vm_flags);
+ return -ETXTBSY;
+ }

Hmmm. Interesting error code for VM_HUGETLB. :)

:) well, the usual EINVAL didn't seem suitable. Any better suggestions ?

+ /* by default assume anon memory */
+ how = CR_VMA_ANON;
+
+ /* if there is a backing file, assume private-mapped */
+ /* (NEED: check if the file is unlinked) */
+ if (vma->vm_file) {
+ flen = PAGE_SIZE;
+ fname = cr_fill_fname(&vma->vm_file->f_path,
+ ctx->vfsroot, ctx->tbuf, &flen);
+ if (IS_ERR(fname))
+ return PTR_ERR(fname);
+ how = CR_VMA_FILE;
+ }
+
+ hh->how = how;
+ hh->fname = !!fname;
+
+ /*
+ * it seems redundant now, but we do it in 3 steps for because:
+ * first, the logic is simpler when we how many pages before
+ * dumping them; second, a future optimization will defer the
+ * writeout (dump, and free) to a later step; in which case all
+ * the pages to be dumped will be aggregated on the checkpoint ctx
+ */
+
+ /* (1) scan: scan through the PTEs of the vma to count the pages
+ * to dump (and later make those pages COW), and keep the list of
+ * pages (and a reference to each page) on the checkpoint ctx */
+ nr = cr_vma_scan_pages(ctx, vma);
+ if (nr < 0)
+ return nr;
+
+ hh->npages = nr;
+ ret = cr_write_obj(ctx, &h, hh);
+
+ if (!ret && flen)
+ ret = cr_write_str(ctx, fname, flen);
+
+ if (ret < 0)
+ return ret;
+
+ /* (2) dump: write out the addresses of all pages in the list (on
+ * the checkpoint ctx) followed by the contents of all pages */
+ ret = cr_vma_dump_pages(ctx, nr);
+
+ /* (3) free: free the extra references to the pages in the list */
+ cr_pgarr_release(ctx);
+
+ return ret;
+}

This gets simpler-looking if you just defer the filename processing
until you actually go to write it out.

Yes. I'll encapsulate that in it's own function.

Oren.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: FUJITA Tomonori: "Re: Writable packet CD-RW mounting broken in 2.6.26 by your commit68154e90c9d1492d570671ae181d9a8f8530da55"
Previous message: Andrew Morton: "Re: [PATCH 2.6.27-rc3] led: driver for LEDs on PCEngines ALIX.2 andALIX.3 boards"
Next in thread: Dave Hansen: "Re: [RFC v2][PATCH 4/9] Memory management - dump state"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]