[PATCH] mm: add per-user limit on mapping count

From: Kirill A. Shutemov
Date: Mon May 12 2014 - 08:13:12 EST

Next message: Peter Zijlstra: "Re: SCHED_DEADLINE, sched_getscheduler(), and sched_getparam()"
Previous message: Michael Kerrisk (man-pages): "SCHED_DEADLINE, sched_getscheduler(), and sched_getparam()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

We're going to increase the per-mm map_count. To avoid non-obvious memory
abuse through the creation of a lot of VMAs, let's introduce a per-user limit.

The limit is implemented as a sysctl. For now the value of the limit is
pretty arbitrary -- 2^20.

sizeof(struct vm_area_struct) with my kernel config (DEBUG_KERNEL=n) is 184
bytes. This means that, with the default limit, a user can use up to
2^20 * 184 bytes = 184 MiB of RAM in VMAs.

The limit does not apply to root (INIT_USER).

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
arch/unicore32/include/asm/mmu_context.h | 2 +-
include/linux/sched.h | 27 +++++++++++++++++++++++++++
include/linux/sched/sysctl.h | 1 +
kernel/fork.c | 3 ++-
kernel/sysctl.c | 8 ++++++++
mm/mmap.c | 17 +++++++++--------
mm/mremap.c | 2 +-
mm/nommu.c | 7 ++++---
8 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index ef470a7a3d0f..f370d74339da 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -76,7 +76,7 @@ do { \
mm->mmap = NULL; \
rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
vmacache_invalidate(mm); \
- mm->map_count--; \
+ dec_map_count(mm); \
remove_vma(high_vma); \
} \
} while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 25f54c79f757..f9f12c503d14 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,7 @@ struct sched_param {
#include <linux/llist.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
+#include <linux/sched/sysctl.h>

#include <asm/processor.h>

@@ -747,6 +748,7 @@ struct user_struct {
atomic_t processes; /* How many processes does this user have? */
atomic_t files; /* How many open files does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
+ atomic_t map_count; /* How many mappings does this user have? */
#ifdef CONFIG_INOTIFY_USER
atomic_t inotify_watches; /* How many inotify watches does this user have? */
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
@@ -2991,4 +2993,29 @@ static inline unsigned long rlimit_max(unsigned int limit)
return task_rlimit_max(current, limit);
}

+static inline void inc_map_count(struct mm_struct *mm)
+{
+ mm->map_count++;
+ atomic_inc(&current_user()->map_count);
+}
+
+static inline void dec_map_count(struct mm_struct *mm)
+{
+ mm->map_count--;
+ atomic_dec(&current_user()->map_count);
+}
+
+static inline bool map_count_check(struct mm_struct *mm, int limit_offset)
+{
+ struct user_struct *user = current_user();
+ if (mm->map_count > sysctl_max_map_count + limit_offset)
+ return true;
+ if (user == INIT_USER)
+ return false;
+ if (atomic_read(&user->map_count) >
+ sysctl_max_map_count_per_user + limit_offset)
+ return true;
+ return false;
+}
+
#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 8045a554cafb..ce66c4697dbf 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -30,6 +30,7 @@ enum { sysctl_hung_task_timeout_secs = 0 };
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;
+extern long sysctl_max_map_count_per_user;

extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..8ea1c538c79e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,7 +454,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
rb_link = &tmp->vm_rb.rb_right;
rb_parent = &tmp->vm_rb;

- mm->map_count++;
+ inc_map_count(mm);
retval = copy_page_range(mm, oldmm, mpnt);

if (tmp->vm_ops && tmp->vm_ops->open)
@@ -600,6 +600,7 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
+ atomic_sub(mm->map_count, &current_user()->map_count);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..4efe2ed927f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1316,6 +1316,14 @@ static struct ctl_table vm_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "max_map_count_per_user",
+ .data = &sysctl_max_map_count_per_user,
+ .maxlen = sizeof(sysctl_max_map_count_per_user),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
#else
{
.procname = "nr_trim_pages",
diff --git a/mm/mmap.c b/mm/mmap.c
index b1202cf81f4b..8e2d581347f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -89,6 +89,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+long sysctl_max_map_count_per_user __read_mostly = 1UL << 20;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
@@ -652,7 +653,7 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);

- mm->map_count++;
+ inc_map_count(mm);
validate_mm(mm);
}

@@ -669,7 +670,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
&prev, &rb_link, &rb_parent))
BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
+ inc_map_count(mm);
}

static inline void
@@ -865,7 +866,7 @@ again: remove_next = 1 + (end > next->vm_end);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
- mm->map_count--;
+ dec_map_count(mm);
mpol_put(vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
@@ -1259,7 +1260,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
return -EOVERFLOW;

/* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count)
+ if (map_count_check(mm, 0))
return -ENOMEM;

/* Obtain the address to map to. we verify (or select) it and ensure
@@ -2378,7 +2379,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
- mm->map_count--;
+ dec_map_count(mm);
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
@@ -2468,7 +2469,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
- if (mm->map_count >= sysctl_max_map_count)
+ if (map_count_check(mm, -1))
return -ENOMEM;

return __split_vma(mm, vma, addr, new_below);
@@ -2517,7 +2518,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
* not exceed its limit; but let map_count go just above
* its limit temporarily, to help free resources as expected.
*/
- if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ if (end < vma->vm_end && map_count_check(mm, -1))
return -ENOMEM;

error = __split_vma(mm, vma, start, 0);
@@ -2637,7 +2638,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;

- if (mm->map_count > sysctl_max_map_count)
+ if (map_count_check(mm, 0))
return -ENOMEM;

if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..f0e34e87828d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -252,7 +252,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
- if (mm->map_count >= sysctl_max_map_count - 3)
+ if (map_count_check(mm, -4))
return -ENOMEM;

/*
diff --git a/mm/nommu.c b/mm/nommu.c
index 85f8d6698d48..5b60bd88405c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -64,6 +64,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+long sysctl_max_map_count_per_user __read_mostly = 1UL << 20;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -710,7 +711,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)

BUG_ON(!vma->vm_region);

- mm->map_count++;
+ inc_map_count(mm);
vma->vm_mm = mm;

protect_vma(vma, vma->vm_flags);
@@ -779,7 +780,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)

protect_vma(vma, 0);

- mm->map_count--;
+ dec_map_count(mm);
for (i = 0; i < VMACACHE_SIZE; i++) {
/* if the vma is cached, invalidate the entire cache */
if (curr->vmacache[i] == vma) {
@@ -1554,7 +1555,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (vma->vm_file)
return -ENOMEM;

- if (mm->map_count >= sysctl_max_map_count)
+ if (map_count_check(mm, -1))
return -ENOMEM;

region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
--
Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Peter Zijlstra: "Re: SCHED_DEADLINE, sched_getscheduler(), and sched_getparam()"
Previous message: Michael Kerrisk (man-pages): "SCHED_DEADLINE, sched_getscheduler(), and sched_getparam()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]