Re: [patch 5/5] elf: Add support for loading ET_CKPT files

From: Tejun Heo
Date: Fri Oct 14 2011 - 13:10:42 EST


Hello,

(cc'ing Oleg and Linus, and quoting whole body)

On Fri, Oct 14, 2011 at 03:04:21PM +0400, Cyrill Gorcunov wrote:
> This patch add ability to run that named "checkpoint" files by
> enhancing Elf file format, which includes
>
> - new Elf file type ET_CKPT
>
> - three additional program header types PT_CKPT_VMA, PT_CKPT_CORE
> and PT_CKPT_PAGES.
>
> PT_CKPT_VMA -- holds 'vma_entry' structure, which describes the
> memory area the kernel should map. It also might contain a file descriptor
> so the kernel will be mapping a file povided. Usually such file get
> opened by user-space helper tool which prepares 'vma_entry' structure
> for the kernel.
>
> PT_CKPT_CORE -- 'core_entry' structure (registers, tls, tasks specific
> settings). The structure is defined as a 16K container which should be
> enough for most cases. 8K of it is reserved for arch specific settings.
>
> PT_CKPT_PAGES -- a set of all pages which contents we should restored.
>
> Apart from Elf extension flush_old_exec() has been splitted to two
> functions -- the former flush_old_exec() and flush_exec_keep_thread().
> The later doesn't call for de_thread() allowing to keep threads
> relationship. Also arch_setup_additional_pages_at() helper added
> to setup vdso at predefined address.
>
> At moment only pure x86-64 architecture is supported.

I don't think this is a good idea. We already have most of the
interfaces necessary for restoring task state and there's no need to
put it into the kernel as one piece. If there are things which can't
be done from userland using existing interfaces, let's please first
discuss what they are and whether they can be resolved differently.

The exec path is very intricate as it is and it would be very easy to
introduce security or other bugs by altering its basic behavior. exec
presumes destruction of (most of) the current process and all its
other threads and replacing them w/ fresh states from an executable.
The scary part - interaction with process hierarchy and zapping of the
current state - is handled from the core exec code.

I see that you removed zapping to allow restoring a multi-threaded
process, which seems quite scary to me. It might be okay, I don't
know, but IMHO it just isn't a very good idea to introduce such a
variation to basic exec behavior for this rather narrow use case.

In addition, leaving it alone doesn't really solve multi-threaded
restoring, does it? Who sets the task states for other threads? The
current code doesn't seem to be doing anything but if you're gonna add
it later, how are you gonna synchronize other threads? If not, what's
the point of pushing this chunk in when userland task state
restoration is necessary anyway?

So, at the moment, I'm rather strongly against this approach.

Thank you.

> Signed-off-by: Cyrill Gorcunov <gorcunov@xxxxxxxxxx>
> CC: Andrew Vagin <avagin@xxxxxxxxxxxxx>
> CC: Pavel Emelyanov <xemul@xxxxxxxxxxxxx>
> CC: James Bottomley <jbottomley@xxxxxxxxxxxxx>
> CC: Glauber Costa <glommer@xxxxxxxxxxxxx>
> CC: H. Peter Anvin <hpa@xxxxxxxxx>
> CC: Ingo Molnar <mingo@xxxxxxx>
> CC: Tejun Heo <tj@xxxxxxxxxx>
> CC: Dave Hansen <dave@xxxxxxxxxxxxxxxxxx>
> CC: Eric W. Biederman <ebiederm@xxxxxxxxxxxx>
> CC: Daniel Lezcano <dlezcano@xxxxxxxxxx>
> CC: Alexey Dobriyan <adobriyan@xxxxxxxxx>
> ---
> arch/x86/include/asm/elf.h | 3
> arch/x86/include/asm/elf_ckpt.h | 80 ++++++++
> arch/x86/kernel/Makefile | 2
> arch/x86/kernel/elf_ckpt.c | 161 ++++++++++++++++++
> arch/x86/vdso/vma.c | 22 ++
> fs/Kconfig.binfmt | 11 +
> fs/Makefile | 1
> fs/binfmt_elf.c | 17 +
> fs/binfmt_elf_ckpt.c | 356 ++++++++++++++++++++++++++++++++++++++++
> fs/exec.c | 27 +--
> include/linux/binfmts.h | 1
> include/linux/elf_ckpt.h | 103 +++++++++++
> 12 files changed, 772 insertions(+), 12 deletions(-)
>
> Index: linux-2.6.git/arch/x86/include/asm/elf.h
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/include/asm/elf.h
> +++ linux-2.6.git/arch/x86/include/asm/elf.h
> @@ -314,7 +314,8 @@ struct linux_binprm;
> #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
> extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> int uses_interp);
> -
> +extern int arch_setup_additional_pages_at(struct linux_binprm *bprm,
> + void *addr, int uses_interp);
> extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
> #define compat_arch_setup_additional_pages syscall32_setup_pages
>
> Index: linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6.git/arch/x86/include/asm/elf_ckpt.h
> @@ -0,0 +1,80 @@
> +#ifndef _LINUX_ELF_X86_CHECKPOINT_H
> +#define _LINUX_ELF_X86_CHECKPOINT_H
> +
> +#include <linux/errno.h>
> +
> +#include <asm/types.h>
> +#include <asm/ptrace.h>
> +
> +#define CKPT_GDT_ENTRY_TLS_ENTRIES 3
> +
> +struct user_regs_entry {
> + __u64 r15;
> + __u64 r14;
> + __u64 r13;
> + __u64 r12;
> + __u64 bp;
> + __u64 bx;
> + __u64 r11;
> + __u64 r10;
> + __u64 r9;
> + __u64 r8;
> + __u64 ax;
> + __u64 cx;
> + __u64 dx;
> + __u64 si;
> + __u64 di;
> + __u64 orig_ax;
> + __u64 ip;
> + __u64 cs;
> + __u64 flags;
> + __u64 sp;
> + __u64 ss;
> + __u64 fs_base;
> + __u64 gs_base;
> + __u64 ds;
> + __u64 es;
> + __u64 fs;
> + __u64 gs;
> +} __packed;
> +
> +struct desc_struct_entry {
> + __u32 a;
> + __u32 b;
> +} __packed;
> +
> +struct user_fpregs_entry {
> + __u16 cwd;
> + __u16 swd;
> + __u16 twd;
> + __u16 fop;
> + __u64 rip;
> + __u64 rdp;
> + __u32 mxcsr;
> + __u32 mxcsr_mask;
> + __u32 st_space[32];
> + __u32 xmm_space[64];
> + __u32 padding[24];
> +} __packed;
> +
> +struct ckpt_arch_entry {
> + struct user_regs_entry gpregs;
> + struct user_fpregs_entry fpregs;
> + struct desc_struct tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES];
> +};
> +
> +struct core_entry;
> +
> +#ifdef CONFIG_X86_64
> +extern int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
> + struct core_entry *core_entry);
> +#else
> +static inline int
> +load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
> + struct core_entry *core_entry)
> +{
> + return -ENOEXEC;
> +}
> +#endif
> +
> +#endif /* _LINUX_ELF_X86_CHECKPOINT_H */
> Index: linux-2.6.git/arch/x86/kernel/Makefile
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/Makefile
> +++ linux-2.6.git/arch/x86/kernel/Makefile
> @@ -99,6 +99,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION)
> obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
> obj-$(CONFIG_OF) += devicetree.o
>
> +obj-$(CONFIG_BINFMT_ELF_CKPT) += elf_ckpt.o
> +
> ###
> # 64 bit specific files
> ifeq ($(CONFIG_X86_64),y)
> Index: linux-2.6.git/arch/x86/kernel/elf_ckpt.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6.git/arch/x86/kernel/elf_ckpt.c
> @@ -0,0 +1,161 @@
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/errno.h>
> +#include <linux/signal.h>
> +#include <linux/binfmts.h>
> +#include <linux/string.h>
> +#include <linux/file.h>
> +#include <linux/slab.h>
> +#include <linux/personality.h>
> +#include <linux/elfcore.h>
> +#include <linux/init.h>
> +#include <linux/highuid.h>
> +#include <linux/compiler.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/security.h>
> +#include <linux/random.h>
> +#include <linux/elf.h>
> +#include <linux/utsname.h>
> +#include <linux/coredump.h>
> +#include <linux/regset.h>
> +
> +#include <asm/uaccess.h>
> +#include <asm/param.h>
> +#include <asm/page.h>
> +#include <asm/prctl.h>
> +#include <asm/proto.h>
> +#include <asm/i387.h>
> +
> +#include <linux/elf_ckpt.h>
> +#include <linux/flex_array.h>
> +#include <asm/tlbflush.h>
> +#include <asm/desc.h>
> +
> +#ifdef CONFIG_X86_64
> +
> +#define cp_reg(d, s, r) d.r = s.r
> +
> +int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs,
> + struct core_entry *core_entry)
> +{
> + struct ckpt_arch_entry *arch = (struct ckpt_arch_entry *)core_entry->arch;
> + struct thread_struct *thread = &current->thread;
> +
> + struct user_regs_struct gpregs;
> + struct user_i387_struct fpregs;
> +
> + mm_segment_t old_fs;
> + int i, ret;
> +
> + if (core_entry->header.arch != CKPT_HEADER_ARCH_X86_64) {
> + pr_err("elf-ckpt-x86: Unsupported or corrupted header\n");
> + return -ENOEXEC;
> + }
> +
> + BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES);
> + BUILD_BUG_ON(sizeof(struct ckpt_arch_entry) > CKPT_ARCH_SIZE);
> +
> + memset(&gpregs, 0, sizeof(gpregs));
> + memset(&fpregs, 0, sizeof(fpregs));
> +
> + /*
> + * General purpose registers
> + */
> + cp_reg(gpregs, arch->gpregs, r15);
> + cp_reg(gpregs, arch->gpregs, r14);
> + cp_reg(gpregs, arch->gpregs, r13);
> + cp_reg(gpregs, arch->gpregs, r12);
> + cp_reg(gpregs, arch->gpregs, bp);
> + cp_reg(gpregs, arch->gpregs, bx);
> + cp_reg(gpregs, arch->gpregs, r11);
> + cp_reg(gpregs, arch->gpregs, r10);
> + cp_reg(gpregs, arch->gpregs, r9);
> + cp_reg(gpregs, arch->gpregs, r8);
> + cp_reg(gpregs, arch->gpregs, ax);
> + cp_reg(gpregs, arch->gpregs, cx);
> + cp_reg(gpregs, arch->gpregs, dx);
> + cp_reg(gpregs, arch->gpregs, si);
> + cp_reg(gpregs, arch->gpregs, di);
> + cp_reg(gpregs, arch->gpregs, orig_ax);
> + cp_reg(gpregs, arch->gpregs, ip);
> + cp_reg(gpregs, arch->gpregs, cs);
> + cp_reg(gpregs, arch->gpregs, flags);
> + cp_reg(gpregs, arch->gpregs, sp);
> + cp_reg(gpregs, arch->gpregs, ss);
> + cp_reg(gpregs, arch->gpregs, fs_base);
> + cp_reg(gpregs, arch->gpregs, gs_base);
> + cp_reg(gpregs, arch->gpregs, ds);
> + cp_reg(gpregs, arch->gpregs, es);
> + cp_reg(gpregs, arch->gpregs, fs);
> + cp_reg(gpregs, arch->gpregs, gs);
> +
> + old_fs = get_fs();
> + set_fs(KERNEL_DS);
> + ret = arch_ptrace(current, PTRACE_SETREGS, 0, (unsigned long)&gpregs);
> + set_fs(old_fs);
> + if (ret)
> + goto out;
> +
> + *regs = *task_pt_regs(current);
> +
> + thread->usersp = arch->gpregs.sp;
> + thread->ds = arch->gpregs.ds;
> + thread->es = arch->gpregs.es;
> + thread->fs = arch->gpregs.fs;
> + thread->gs = arch->gpregs.gs;
> +
> + thread->fsindex = thread->fs;
> + thread->gsindex = thread->gs;
> +
> + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> + thread->tls_array[i].a = arch->tls_array[i].a;
> + thread->tls_array[i].b = arch->tls_array[i].b;
> + }
> +
> + if (arch->gpregs.fs_base) {
> + ret = do_arch_prctl(current, ARCH_SET_FS, arch->gpregs.fs_base);
> + if (ret)
> + goto out;
> + }
> +
> + if (arch->gpregs.gs_base) {
> + ret = do_arch_prctl(current, ARCH_SET_GS, arch->gpregs.gs_base);
> + if (ret)
> + goto out;
> + }
> +
> + /* Restoring FPU */
> + if (core_entry->task_flags & PF_USED_MATH) {
> +
> + cp_reg(fpregs, arch->fpregs, cwd);
> + cp_reg(fpregs, arch->fpregs, swd);
> + cp_reg(fpregs, arch->fpregs, twd);
> + cp_reg(fpregs, arch->fpregs, fop);
> + cp_reg(fpregs, arch->fpregs, rip);
> + cp_reg(fpregs, arch->fpregs, rdp);
> + cp_reg(fpregs, arch->fpregs, mxcsr);
> + cp_reg(fpregs, arch->fpregs, mxcsr_mask);
> +
> + for (i = 0; i < ARRAY_SIZE(arch->fpregs.st_space); i++)
> + cp_reg(fpregs, arch->fpregs, st_space[i]);
> +
> + for (i = 0; i < ARRAY_SIZE(arch->fpregs.xmm_space); i++)
> + cp_reg(fpregs, arch->fpregs, xmm_space[i]);
> +
> + old_fs = get_fs();
> + set_fs(KERNEL_DS);
> + ret = arch_ptrace(current, PTRACE_SETFPREGS, 0, (unsigned long)&fpregs);
> + set_fs(old_fs);
> + if (ret)
> + goto out;
> + }
> +
> +out:
> + return ret;
> +}
> +
> +#endif /* CONFIG_X86_64 */
> Index: linux-2.6.git/arch/x86/vdso/vma.c
> ===================================================================
> --- linux-2.6.git.orig/arch/x86/vdso/vma.c
> +++ linux-2.6.git/arch/x86/vdso/vma.c
> @@ -137,6 +137,28 @@ up_fail:
> return ret;
> }
>
> +int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp)
> +{
> + struct mm_struct *mm = current->mm;
> + int ret;
> +
> + if (!vdso_enabled)
> + return 0;
> +
> + down_write(&mm->mmap_sem);
> + current->mm->context.vdso = addr;
> + ret = install_special_mapping(mm, (unsigned long)addr, vdso_size,
> + VM_READ | VM_EXEC |
> + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
> + VM_ALWAYSDUMP,
> + vdso_pages);
> + if (ret)
> + current->mm->context.vdso = NULL;
> +
> + up_write(&mm->mmap_sem);
> + return ret;
> +}
> +
> static __init int vdso_setup(char *s)
> {
> vdso_enabled = simple_strtoul(s, NULL, 0);
> Index: linux-2.6.git/fs/Kconfig.binfmt
> ===================================================================
> --- linux-2.6.git.orig/fs/Kconfig.binfmt
> +++ linux-2.6.git/fs/Kconfig.binfmt
> @@ -23,6 +23,17 @@ config BINFMT_ELF
> ld.so (check the file <file:Documentation/Changes> for location and
> latest version).
>
> +config BINFMT_ELF_CKPT
> + tristate "Kernel support for CKPT ELF binaries"
> + default n
> + depends on BINFMT_ELF && X86_64
> + help
> + ELF CKPT (checkpoint) is an extension to ELF format to restore
> + checkpointed processes. It's not confirmed yet and highly
> + experimental.
> +
> + If unsure, say N.
> +
> config COMPAT_BINFMT_ELF
> bool
> depends on COMPAT && BINFMT_ELF
> Index: linux-2.6.git/fs/Makefile
> ===================================================================
> --- linux-2.6.git.orig/fs/Makefile
> +++ linux-2.6.git/fs/Makefile
> @@ -37,6 +37,7 @@ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc
> obj-y += binfmt_script.o
>
> obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
> +obj-$(CONFIG_BINFMT_ELF_CKPT) += binfmt_elf_ckpt.o
> obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
> obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
> obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
> Index: linux-2.6.git/fs/binfmt_elf.c
> ===================================================================
> --- linux-2.6.git.orig/fs/binfmt_elf.c
> +++ linux-2.6.git/fs/binfmt_elf.c
> @@ -30,6 +30,7 @@
> #include <linux/security.h>
> #include <linux/random.h>
> #include <linux/elf.h>
> +#include <linux/elf_ckpt.h>
> #include <linux/utsname.h>
> #include <linux/coredump.h>
> #include <asm/uaccess.h>
> @@ -592,7 +593,11 @@ static int load_elf_binary(struct linux_
> if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
> goto out;
>
> - if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
> + if (loc->elf_ex.e_type != ET_EXEC &&
> +#ifdef CONFIG_BINFMT_ELF_CKPT
> + loc->elf_ex.e_type != ET_CKPT &&
> +#endif
> + loc->elf_ex.e_type != ET_DYN)
> goto out;
> if (!elf_check_arch(&loc->elf_ex))
> goto out;
> @@ -619,6 +624,16 @@ static int load_elf_binary(struct linux_
> goto out_free_ph;
> }
>
> +#ifdef CONFIG_BINFMT_ELF_CKPT
> + if (loc->elf_ex.e_type == ET_CKPT) {
> + retval = load_elf_ckpt(bprm, regs, &loc->elf_ex,
> + (struct elf_phdr *)elf_phdata);
> + if (!retval)
> + set_binfmt(&elf_format);
> + goto out_free_ph;
> + }
> +#endif
> +
> elf_ppnt = elf_phdata;
> elf_bss = 0;
> elf_brk = 0;
> Index: linux-2.6.git/fs/binfmt_elf_ckpt.c
> ===================================================================
> --- /dev/null
> +++ linux-2.6.git/fs/binfmt_elf_ckpt.c
> @@ -0,0 +1,356 @@
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/errno.h>
> +#include <linux/signal.h>
> +#include <linux/binfmts.h>
> +#include <linux/string.h>
> +#include <linux/file.h>
> +#include <linux/slab.h>
> +#include <linux/personality.h>
> +#include <linux/elfcore.h>
> +#include <linux/init.h>
> +#include <linux/highuid.h>
> +#include <linux/compiler.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/security.h>
> +#include <linux/random.h>
> +#include <linux/elf.h>
> +#include <linux/utsname.h>
> +#include <linux/coredump.h>
> +#include <linux/regset.h>
> +
> +#include <asm/uaccess.h>
> +#include <asm/param.h>
> +#include <asm/page.h>
> +#include <asm/prctl.h>
> +#include <asm/proto.h>
> +#include <asm/i387.h>
> +
> +#include <linux/elf_ckpt.h>
> +#include <asm/elf_ckpt.h>
> +
> +#include <linux/flex_array.h>
> +#include <asm/tlbflush.h>
> +#include <asm/desc.h>
> +
> +int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
> + struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
> +{
> + struct elf_phdr *elf_phdr_pages;
> + struct flex_array *fa = NULL;
> + struct vma_entry *vma_entry_ptr;
> + int nr_vma_found, nr_vma_mapped;
> + struct vma_entry vma_entry;
> + struct file *file = NULL;
> + unsigned long map_addr;
> +
> +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> + unsigned long vdso = -1UL;
> +#endif
> +
> + struct core_entry *core_entry = NULL;
> + unsigned long start_stack = -1UL;
> +
> + int i, ret = -ENOEXEC;
> + loff_t off;
> +
> + BUILD_BUG_ON(CKPT_TASK_COMM_LEN != TASK_COMM_LEN);
> + BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE);
> + BUILD_BUG_ON(CKPT_CORE_SIZE != sizeof(*core_entry));
> +
> + elf_phdr_pages = NULL;
> + nr_vma_found = 0;
> + nr_vma_mapped = 0;
> +
> + /*
> + * An early check for header version so if we fail here
> + * we would not need to use flex array at all.
> + */
> + for (i = 0; i < elf_ex->e_phnum; i++) {
> + if (elf_phdr[i].p_type != PT_CKPT_CORE)
> + continue;
> +
> + core_entry = vmalloc(sizeof(*core_entry));
> + if (!core_entry) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
> + (char *)core_entry, sizeof(*core_entry));
> + if (ret != sizeof(*core_entry)) {
> + pr_err("elf-ckpt: Can't read core_entry\n");
> + ret = -EIO;
> + goto out;
> + }
> +
> + if (core_entry->header.version != CKPT_HEADER_VERSION) {
> + pr_err("elf-ckpt: Unsupported or corrupted header\n");
> + ret = -ENOEXEC;
> + goto out;
> + }
> +
> + break;
> + }
> +
> + if (i == elf_ex->e_phnum) {
> + pr_err("elf-ckpt: No header found\n");
> + ret = -ENOEXEC;
> + goto out;
> + }
> +
> +
> + fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL);
> + if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) {
> + ret = -ENOMEM;
> + if (fa) {
> + flex_array_free(fa);
> + fa = NULL;
> + goto out;
> + }
> + }
> +
> + ret = flush_exec_keep_thread(bprm);
> + if (ret)
> + goto out;
> +
> + current->flags &= ~PF_FORKNOEXEC;
> + current->mm->def_flags = 0;
> +
> + /*
> + * We don't care about parameters passed (such as argc, argv, env)
> + * when execute checkpoint file because we're to substitute
> + * all things anyway.
> + */
> + do_munmap(current->mm, 0, TASK_SIZE);
> +
> + SET_PERSONALITY(loc->elf_ex);
> +
> + for (i = 0; i < elf_ex->e_phnum; i++) {
> +
> + switch (elf_phdr[i].p_type) {
> + case PT_CKPT_VMA:
> + ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
> + (char *)&vma_entry, sizeof(vma_entry));
> + if (ret != sizeof(vma_entry)) {
> + pr_err("elf-ckpt: Can't read vma_entry\n");
> + ret = -EIO;
> + goto out;
> + }
> + if (flex_array_put(fa, i, &vma_entry, GFP_KERNEL))
> + BUG();
> +
> + /* We need to know if there is executable stack */
> + if (vma_entry.status & VMA_AREA_STACK) {
> + if (vma_entry.flags & PROT_EXEC)
> + current->personality |= READ_IMPLIES_EXEC;
> + }
> +
> + nr_vma_found++;
> + continue;
> + case PT_CKPT_PAGES:
> + elf_phdr_pages = &elf_phdr[i];
> + continue;
> + default:
> + continue;
> + }
> + }
> +
> + /* Be sure it has the file structure we expected to see. */
> + if (!elf_phdr_pages || !nr_vma_found) {
> + ret = -ENOEXEC;
> + goto out;
> + }
> +
> + /*
> + * VMA randomization still needs to be set (just in case if
> + * the program we restore will exec() something else later).
> + */
> + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
> + current->flags |= PF_RANDOMIZE;
> +
> + /*
> + * FIXME: Note it flushes signal handlers as well,
> + * so we need to dump queued signals and restore
> + * them here.
> + */
> + setup_new_exec(bprm);
> +
> + current->mm->free_area_cache = current->mm->mmap_base;
> + current->mm->cached_hole_size = 0;
> +
> + for (i = 0; i < nr_vma_found; i++) {
> + vma_entry_ptr = flex_array_get(fa, i);
> +
> +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> + if (vma_entry_ptr->status & VMA_AREA_VDSO)
> + vdso = vma_entry_ptr->start;
> +#endif
> +
> + if (vma_entry_ptr->status & VMA_AREA_STACK) {
> + /* Note if stack is VM_GROWSUP -- it should be reversed */
> + start_stack = vma_entry_ptr->start;
> + }
> +
> + /* Anything special should be ignored */
> + if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
> + continue;
> +
> + /* It's a file mmap'ed */
> + if (vma_entry_ptr->fd != -1) {
> + file = fget((unsigned int)vma_entry_ptr->fd);
> + if (!file) {
> + ret = -EBADF;
> + goto out_unmap;
> + }
> +
> + /* Reuse this field to handle error cases */
> + vma_entry_ptr->fd = (__u64)file;
> + } else
> + file = NULL;
> +
> + down_write(&current->mm->mmap_sem);
> + map_addr = do_mmap(file,
> + vma_entry_ptr->start,
> + vma_entry_ptr->end - vma_entry_ptr->start,
> + vma_entry_ptr->prot,
> + vma_entry_ptr->flags | MAP_FIXED,
> + vma_entry_ptr->pgoff);
> + up_write(&current->mm->mmap_sem);
> +
> + if (file) {
> + fput(file);
> + do_close((unsigned int)vma_entry_ptr->fd);
> + }
> +
> + if ((unsigned long)(map_addr) >= TASK_SIZE) {
> + ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL;
> + goto out_unmap;
> + }
> +
> + nr_vma_mapped++;
> + }
> +
> +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> + if (vdso == -1UL) {
> + pr_err("elf-ckpt: Can't find VDSO address\n");
> + ret = -ENOEXEC;
> + goto out_unmap;
> + }
> +#endif
> +
> + if (start_stack == -1UL) {
> + pr_err("elf-ckpt: Can't find stack VMA\n");
> + ret = -ENOEXEC;
> + goto out_unmap;
> + }
> +
> + /* The name it has before */
> + set_task_comm(current, core_entry->task_comm);
> +
> + bprm->p = core_entry->mm_start_stack;
> +
> + current->mm->start_code = core_entry->mm_start_code;
> + current->mm->end_code = core_entry->mm_end_code;
> + current->mm->start_data = core_entry->mm_start_data;
> + current->mm->end_data = core_entry->mm_end_data;
> + current->mm->start_stack = core_entry->mm_start_stack;
> + current->mm->start_brk = core_entry->mm_start_brk;
> + current->mm->brk = core_entry->mm_brk;
> +
> +#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> + ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0);
> + if (ret) {
> + pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n",
> + vdso, ret);
> + goto out_unmap;
> + }
> +#endif
> +
> + /*
> + * Restore pages
> + */
> + off = elf_phdr_pages->p_offset;
> + while (1) {
> + struct vm_area_struct *vma;
> + struct page *page;
> + void *page_data;
> + __u64 va;
> +
> + ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va));
> + if (ret != sizeof(va)) {
> + pr_err("elf-ckpt: Can't read page virtual address: "
> + "ret = %d off = %lx\n", ret, (unsigned long)off);
> + ret = -EIO;
> + goto out_unmap;
> + }
> +
> + /* End of pages reached */
> + if (!va)
> + break;
> +
> + vma = find_vma(current->mm, (unsigned long)va);
> + if (!vma) {
> + pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va);
> + ret = -ESRCH;
> + goto out_unmap;
> + }
> +
> + ret = get_user_pages(current, current->mm, (unsigned long)va,
> + 1, 1, 1, &page, NULL);
> + if (ret != 1) {
> + pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va);
> + ret = -EFAULT;
> + goto out_unmap;
> + }
> +
> + page_data = kmap(page);
> + ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE);
> + kunmap(page);
> + put_page(page);
> +
> + if (ret != PAGE_SIZE) {
> + pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va);
> + ret = -EFAULT;
> + goto out_unmap;
> + }
> +
> + off += sizeof(va) + PAGE_SIZE;
> + }
> +
> + /*
> + * Architecture specific setup for registers
> + * and friends, it's done lately since if
> + * an error happened before there is no much
> + * point to setup arch-specific things at all.
> + */
> + ret = load_elf_ckpt_arch(current, regs, core_entry);
> + if (ret)
> + goto out_unmap;
> +
> + /* We're done */
> + ret = 0;
> +out:
> + if (core_entry)
> + vfree(core_entry);
> +
> + if (fa)
> + flex_array_free(fa);
> + return ret;
> +
> +out_unmap:
> + for (i = 0; i < nr_vma_mapped; i++) {
> + vma_entry_ptr = flex_array_get(fa, i);
> + down_write(&current->mm->mmap_sem);
> + do_munmap(current->mm, vma_entry_ptr->start,
> + vma_entry_ptr->end - vma_entry_ptr->start);
> + up_write(&current->mm->mmap_sem);
> + }
> +
> + send_sig(SIGKILL, current, 0);
> + goto out;
> +}
> Index: linux-2.6.git/fs/exec.c
> ===================================================================
> --- linux-2.6.git.orig/fs/exec.c
> +++ linux-2.6.git/fs/exec.c
> @@ -1071,18 +1071,10 @@ void set_task_comm(struct task_struct *t
> perf_event_comm(tsk);
> }
>
> -int flush_old_exec(struct linux_binprm * bprm)
> +int flush_exec_keep_thread(struct linux_binprm * bprm)
> {
> int retval;
>
> - /*
> - * Make sure we have a private signal table and that
> - * we are unassociated from the previous thread group.
> - */
> - retval = de_thread(current);
> - if (retval)
> - goto out;
> -
> set_mm_exe_file(bprm->mm, bprm->file);
>
> /*
> @@ -1101,10 +1093,25 @@ int flush_old_exec(struct linux_binprm *
> current->personality &= ~bprm->per_clear;
>
> return 0;
> -
> out:
> return retval;
> }
> +EXPORT_SYMBOL(flush_exec_keep_thread);
> +
> +int flush_old_exec(struct linux_binprm * bprm)
> +{
> + int retval;
> +
> + /*
> + * Make sure we have a private signal table and that
> + * we are unassociated from the previous thread group.
> + */
> + retval = de_thread(current);
> + if (retval)
> + return retval;
> +
> + return flush_exec_keep_thread(bprm);
> +}
> EXPORT_SYMBOL(flush_old_exec);
>
> void would_dump(struct linux_binprm *bprm, struct file *file)
> Index: linux-2.6.git/include/linux/binfmts.h
> ===================================================================
> --- linux-2.6.git.orig/include/linux/binfmts.h
> +++ linux-2.6.git/include/linux/binfmts.h
> @@ -110,6 +110,7 @@ extern int prepare_binprm(struct linux_b
> extern int __must_check remove_arg_zero(struct linux_binprm *);
> extern int search_binary_handler(struct linux_binprm *, struct pt_regs *);
> extern int flush_old_exec(struct linux_binprm * bprm);
> +extern int flush_exec_keep_thread(struct linux_binprm * bprm);
> extern void setup_new_exec(struct linux_binprm * bprm);
> extern void would_dump(struct linux_binprm *, struct file *);
>
> Index: linux-2.6.git/include/linux/elf_ckpt.h
> ===================================================================
> --- /dev/null
> +++ linux-2.6.git/include/linux/elf_ckpt.h
> @@ -0,0 +1,103 @@
> +#ifndef _LINUX_ELF_CHECKPOINT_H
> +#define _LINUX_ELF_CHECKPOINT_H
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/types.h>
> +#include <linux/elf-em.h>
> +
> +#include <asm/elf.h>
> +#include <asm/elf_ckpt.h>
> +
> +/*
> + * Elf extension includes new Elf file type
> + * and program header types as well.
> + */
> +#define ET_CKPT 5
> +
> +#define PT_CKPT_OFFSET 0x01010101
> +
> +#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
> +#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
> +#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
> +
> +#define CKPT_PAGE_SIZE 4096
> +#define CKPT_TASK_COMM_LEN 16
> +
> +#define CKPT_HEADER_VERSION 1
> +#define CKPT_HEADER_ARCH_X86_64 1
> +
> +#define VMA_AREA_REGULAR (1 << 0)
> +#define VMA_AREA_STACK (1 << 1)
> +#define VMA_AREA_VSYSCALL (1 << 2)
> +#define VMA_AREA_VDSO (1 << 3)
> +#define VMA_FORCE_READ (1 << 4)
> +#define VMA_AREA_HEAP (1 << 5)
> +#define VMA_FILE_PRIVATE (1 << 6)
> +#define VMA_FILE_SHARED (1 << 7)
> +#define VMA_ANON_SHARED (1 << 8)
> +#define VMA_ANON_PRIVATE (1 << 9)
> +#define VMA_FORCE_WRITE (1 << 10)
> +
> +struct vma_entry {
> + __u64 start;
> + __u64 end;
> + __u64 pgoff;
> + __u32 prot;
> + __u32 flags;
> + __u32 status; /* from VMA_x above */
> + __u32 pid; /* pid VMA belongs to */
> + __s64 fd;
> + __u64 ino;
> + __u32 dev_maj;
> + __u32 dev_min;
> +} __packed;
> +
> +struct page_entry {
> + __u64 va; /* page virtual address */
> + __u8 data[CKPT_PAGE_SIZE]; /* page contents */
> +} __packed;
> +
> +struct image_header {
> + __u16 version;
> + __u16 arch;
> + __u32 flags;
> +} __packed;
> +
> +#define CKPT_ARCH_SIZE (2 * 4096)
> +#define CKPT_CORE_SIZE (4 * 4096)
> +
> +struct core_entry {
> + union {
> + struct {
> + struct image_header header;
> + __u8 arch[CKPT_ARCH_SIZE]; /* should be enough for all archs */
> + __u32 task_personality;
> + __u8 task_comm[CKPT_TASK_COMM_LEN];
> + __u32 task_flags;
> + __u64 mm_start_code;
> + __u64 mm_end_code;
> + __u64 mm_start_data;
> + __u64 mm_end_data;
> + __u64 mm_start_stack;
> + __u64 mm_start_brk;
> + __u64 mm_brk;
> + };
> + __u8 __core_pad[CKPT_CORE_SIZE];
> + };
> +} __packed;
> +
> +#ifdef CONFIG_BINFMT_ELF_CKPT
> +extern int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
> + struct elfhdr *elf_ex, struct elf_phdr *elf_phdr);
> +#else
> +static inline int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
> + struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
> +{
> + return -ENOEXEC;
> +}
> +#endif
> +
> +#endif /* __KERNEL__ */
> +
> +#endif /* _LINUX_ELF_CHECKPOINT_H */
>

--
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/