Re: Memory usage per memory zone

From: Thomas Schoebel-Theuer
Date: Wed Mar 11 2009 - 10:14:24 EST


Am Mittwoch, 11. März 2009 11:41:43 schrieb jack marrow:
> I have a box where the oom-killer is killing processes due to running
> out of memory in zone_normal. I can see using slabtop that the inode
> caches are using up lots of memory and guess this is the problem, so
> have cleared them using an echo to drop_caches.

Hi Jack,

my experience with plain old 2.6.24 on 32bit _production_ boxes was that under
heavy load and after >30 days uptime we saw a sudden inflation of oom-killers
on some of them until those boxes died. The standard kernel statistics about
memory looked much the same as yours (and I suspect they could have been
wrong or at least misleading, but I have neither checked nor tried to fix).

> is it possible to use slabtop
> (or any other way) to view ram usage per zone so I can pick out the
> culprit?

Try the attached experimental hack which can provide you with some insight
about what's really going on in the _physical_ memory. Since it does not
allocate any memory for the purpose of displaying those memory patterns it
wants to examine, you have to allocate a large enough buffer in userspace.
Don't use cat, but something like dd with parameters such as bs=1M (as
mentioned in the comment). Probably you have to adjust the patch to some
newer kernel versions, and/or to fix some sysctl table checks if you want to
get it upstream (I will not). And, of course, you can visualize more/other
flags as well.

After gaining some insight with /proc/sys/vm/mempattern and some development
of further experimental patches which successfully reduced fragmentation (but
finally only _delayed_ the oom problems without being able to _fundamentally_
resolve them), the ultimate solution was just to use CONFIG_VMSPLIT_2G or
even CONFIG_VMSPLIT_1G in order to overcome the artificial shortening of
zone_normal.

This supports old wisdom that an OS cannot give you resources it just does not
possess... Just make sure you have enough resources for your working set.
That's all.

In the hope of being helpful,

Thomas
diff -urw linux-2.6.24.fragmentation-base/include/linux/mmzone.h linux-2.6.24.fragmentation-info/include/linux/mmzone.h
--- linux-2.6.24.fragmentation-base/include/linux/mmzone.h 2008-09-30 12:50:05.000000000 +0200
+++ linux-2.6.24.fragmentation-info/include/linux/mmzone.h 2009-01-27 14:27:42.000000000 +0100
@@ -687,6 +687,11 @@
struct file *, void __user *, size_t *, loff_t *);
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */
+int sysctl_mempattern(struct ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+int mempattern_sysctl_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);

#include <linux/topology.h>
/* Returns the number of the current Node. */
diff -urw linux-2.6.24.fragmentation-base/include/linux/sysctl.h linux-2.6.24.fragmentation-info/include/linux/sysctl.h
--- linux-2.6.24.fragmentation-base/include/linux/sysctl.h 2008-11-12 12:41:47.000000000 +0100
+++ linux-2.6.24.fragmentation-info/include/linux/sysctl.h 2009-01-27 14:05:17.000000000 +0100
@@ -225,6 +225,7 @@
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_MEMPATTERN=36, /* output physical memory patterns */
};


diff -urw linux-2.6.24.fragmentation-base/kernel/sysctl.c linux-2.6.24.fragmentation-info/kernel/sysctl.c
--- linux-2.6.24.fragmentation-base/kernel/sysctl.c 2008-11-12 12:41:47.000000000 +0100
+++ linux-2.6.24.fragmentation-info/kernel/sysctl.c 2009-01-27 14:43:08.000000000 +0100
@@ -987,6 +987,13 @@
.extra1 = &zero,
},
{
+ .ctl_name = VM_MEMPATTERN,
+ .procname = "mempattern",
+ .mode = 0444,
+ .proc_handler = &mempattern_sysctl_handler,
+ .strategy = &sysctl_mempattern,
+ },
+ {
.ctl_name = VM_PERCPU_PAGELIST_FRACTION,
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
diff -urw linux-2.6.24.fragmentation-base/mm/page_alloc.c linux-2.6.24.fragmentation-info/mm/page_alloc.c
--- linux-2.6.24.fragmentation-base/mm/page_alloc.c 2008-09-30 12:50:05.000000000 +0200
+++ linux-2.6.24.fragmentation-info/mm/page_alloc.c 2009-01-29 15:54:31.000000000 +0100
@@ -1947,6 +1947,126 @@
show_swap_cache_info();
}

+/* Strategy routine for dumping memory patterns.
+ * It does not allocate kernel memory, but rather copies everything
+ * directly to userspace.
+ * The userspace buffer should be large enough, otherwise output is
+ * truncated.
+ * Use something like "dd if=/proc/sys/vm/mempattern bs=1M"
+ * to reserve such a large buffer in userspace.
+ */
+size_t do_mempattern(void __user *buf, size_t bufsize) /* NOTE(review): returns -EFAULT below, but the return type is unsigned (size_t) -- callers cannot tell an error from a huge byte count */
+{
+ struct zone *zone;
+ size_t pos = 0; /* bytes emitted into the userspace buffer so far */
+
+#define WRITE_CHAR(out,order,pfn) /* NOTE(review): multi-statement macro, not do{}while(0) -- unsafe under an unbraced if/else */ \
+ if(bufsize - pos <= (1 << (order))) /* bail out if the worst case for this order would not fit */ \
+ goto done; \
+ if(!((pfn) % 64)) { /* start a new row every 64 pfns so rows align to physical addresses */ \
+ put_user('\n', (char __user *)(buf + pos)); /* NOTE(review): put_user return value ignored here and below */ \
+ pos++; \
+ } \
+ put_user((out), (char __user *)(buf + pos)); \
+ pos++;
+
+ //printk("called do_mempattern bufsize=%ld\n", bufsize);
+ for_each_zone(zone) {
+ size_t len = strlen(zone->name);
+ int pfn; /* NOTE(review): pfns are conventionally unsigned long; int may overflow on large boxes */
+
+ if(bufsize - pos <= len+7) /* room for "Zone: " + name + trailing newline */
+ goto done;
+ if(copy_to_user(buf + pos, "Zone: ", 6))
+ return -EFAULT; /* becomes a huge positive size_t -- see note on the return type */
+ pos += 6;
+ if(copy_to_user(buf + pos, zone->name, len))
+ return -EFAULT;
+ pos += len;
+ WRITE_CHAR('\n', 0, -1); /* pfn == -1 fails the %64 test, so this emits exactly one newline */
+
+ for(pfn = zone->zone_start_pfn; pfn < zone->zone_start_pfn+zone->spanned_pages; pfn++) {
+ struct page *page;
+ int order = 0;
+ int i;
+ char out;
+ if(!pfn_valid(pfn)) /* skip holes in the zone's pfn range */
+ continue;
+ page = pfn_to_page(pfn);
+ if(PageBuddy(page)) { /* head of a free buddy block: 'X', tail pages shown as 'x' below */
+ order = page_order(page);
+ out = 'X';
+ } else if(PageSlab(page)) { /* slab cache page */
+ out = 's';
+ } else if(PageDirty(page)) { /* dirty: '@' when also on an LRU list, plain '#' otherwise */
+ out = '#';
+ if(PageLRU(page)) {
+ out = '@';
+ }
+ } else if(PageLRU(page)) { /* clean page on an LRU list */
+ out = ':';
+ } else { /* anything else: kernel, reserved, unclassified */
+ out = '.';
+ }
+ WRITE_CHAR(out, order, pfn);
+ for(i = 1; i < (1 << order); i++) { /* tail pages of a higher-order free block */
+ pfn++;
+ WRITE_CHAR('x', 0, pfn);
+ }
+ }
+ WRITE_CHAR('\n', 0, -1);
+ }
+ done:
+ return pos; /* bytes written -- but see the error caveat above */
+}
+
+int sysctl_mempattern(struct ctl_table *table, int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen) /* sysctl(2) strategy routine: dumps the pattern into the caller's oldval buffer */
+{
+ size_t res;
+ size_t bufsize;
+ printk("called sysctl_mempattern\n"); /* NOTE(review): debug leftover; also missing a KERN_* level prefix */
+
+ if (!table->data || !table->maxlen) /* NOTE(review): the mempattern ctl_table entry sets neither .data nor .maxlen, so this guard appears to always return -ENOTDIR -- verify intent */
+ return -ENOTDIR;
+
+ if(!oldval || !oldlenp) { /* caller supplied no output buffer: nothing to do */
+ return 0;
+ }
+
+ if (get_user(bufsize, oldlenp)) /* read the size of the caller's buffer */
+ return -EFAULT;
+
+ res = do_mempattern(oldval, bufsize);
+ if(res >= 0) { /* NOTE(review): res is size_t (unsigned), so this is always true and the error branch below is unreachable */
+ if (put_user(res, oldlenp)) /* report the number of bytes actually written */
+ return -EFAULT;
+ return 1;
+ }
+ return res;
+}
+
+int mempattern_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos) /* /proc/sys read handler; 'write' is never checked (the entry is mode 0444) */
+{
+ size_t res;
+ //printk("called mempattern_sysctl_handler write=%d length=%ld\n", write, *length);
+ if (!*length || *ppos) { /* any follow-up read (ppos != 0) returns EOF, so the whole dump must fit in one read */
+ *length = 0;
+ return 0;
+ }
+ res = do_mempattern(buffer, *length);
+ //printk("res=%ld\n", res);
+ if(res > 0) { /* NOTE(review): res is size_t, so an -EFAULT from do_mempattern shows up as a huge positive value and is treated as success here */
+ *ppos += res;
+ *length = res;
+ return 0;
+ }
+ return res;
+}
+
+
/*
* Builds allocation fallback zone lists.
*