Re: [PATCH 1/3] extend get/setrlimit to support setting rlimits external to a process (v3)

From: Neil Horman
Date: Thu Oct 01 2009 - 13:16:56 EST


Augment /proc/<pid>/limits file to support limit setting

It was suggested to me recently that we support a mechanism by which we can set
various process limits from points external to the process. The reasoning is
that some processes are very long-lived, and it would be beneficial to these
long-lived processes if we could modify their various limits without needing to
kill them, adjust the limits for the user, and restart them. While individual
applications can certainly export this control on their own, it would be nice if
such functionality were available to a sysadmin, without needing to have each
application re-invent the wheel.

As such, I've implemented the below patch, which makes /proc/<pid>/limits
writable for each process. By writing a line of the following format:
<limit> <current value> <max value>
to the limits file (for example, "nofile 1024 4096"; either value may also be
given as "unlimited"), an administrator can now dynamically change the limits
for the respective process. Tested by myself with good results.

Signed-off-by: Neil Horman <nhorman@xxxxxxxxxxxxx>


fs/proc/base.c | 165 +++++++++++++++++++++++++++++++++++++++++---------
include/linux/sched.h | 3
kernel/sys.c | 48 +++++++++-----
3 files changed, 169 insertions(+), 47 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6..2f05799 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,6 +49,8 @@

#include <asm/uaccess.h>

+#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -455,72 +457,177 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
struct limit_names {
char *name;
char *unit;
+ char *match;
};

static const struct limit_names lnames[RLIM_NLIMITS] = {
- [RLIMIT_CPU] = {"Max cpu time", "ms"},
- [RLIMIT_FSIZE] = {"Max file size", "bytes"},
- [RLIMIT_DATA] = {"Max data size", "bytes"},
- [RLIMIT_STACK] = {"Max stack size", "bytes"},
- [RLIMIT_CORE] = {"Max core file size", "bytes"},
- [RLIMIT_RSS] = {"Max resident set", "bytes"},
- [RLIMIT_NPROC] = {"Max processes", "processes"},
- [RLIMIT_NOFILE] = {"Max open files", "files"},
- [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
- [RLIMIT_AS] = {"Max address space", "bytes"},
- [RLIMIT_LOCKS] = {"Max file locks", "locks"},
- [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
- [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
- [RLIMIT_NICE] = {"Max nice priority", NULL},
- [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
- [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+ [RLIMIT_CPU] = {"Max cpu time", "ms", "cpu"},
+ [RLIMIT_FSIZE] = {"Max file size", "bytes", "fsize"},
+ [RLIMIT_DATA] = {"Max data size", "bytes", "data"},
+ [RLIMIT_STACK] = {"Max stack size", "bytes", "stack"},
+ [RLIMIT_CORE] = {"Max core file size", "bytes", "core"},
+ [RLIMIT_RSS] = {"Max resident set", "bytes", "rss"},
+ [RLIMIT_NPROC] = {"Max processes", "processes", "nproc"},
+ [RLIMIT_NOFILE] = {"Max open files", "files", "nofile"},
+ [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes", "memlock"},
+ [RLIMIT_AS] = {"Max address space", "bytes", "as"},
+ [RLIMIT_LOCKS] = {"Max file locks", "locks", "locks"},
+ [RLIMIT_SIGPENDING] = {"Max pending signals", "signals", "sigpending"},
+ [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes", "msgqueue"},
+ [RLIMIT_NICE] = {"Max nice priority", NULL, "nice"},
+ [RLIMIT_RTPRIO] = {"Max realtime priority", NULL, "rtprio"},
+ [RLIMIT_RTTIME] = {"Max realtime timeout", "us", "rttime"},
};

/* Display limits for a process */
-static int proc_pid_limits(struct task_struct *task, char *buffer)
+static ssize_t proc_pid_limit_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
{
unsigned int i;
- int count = 0;
unsigned long flags;
- char *bufptr = buffer;
+ char *bufptr;
+ size_t bcount = 0;
+ size_t ccount = 0;
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);

struct rlimit rlim[RLIM_NLIMITS];

+ bufptr = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!bufptr)
+ goto out;
+
if (!lock_task_sighand(task, &flags))
- return 0;
+ goto out;
memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
unlock_task_sighand(task, &flags);

/*
* print the file header
*/
- count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
+ bcount += sprintf(&bufptr[bcount], "%-25s %-20s %-20s %-10s\n",
"Limit", "Soft Limit", "Hard Limit", "Units");

for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-25s %-20s ",
+ bcount += sprintf(&bufptr[bcount], "%-25s %-20s ",
lnames[i].name, "unlimited");
else
- count += sprintf(&bufptr[count], "%-25s %-20lu ",
+ bcount += sprintf(&bufptr[bcount], "%-25s %-20lu ",
lnames[i].name, rlim[i].rlim_cur);

if (rlim[i].rlim_max == RLIM_INFINITY)
- count += sprintf(&bufptr[count], "%-20s ", "unlimited");
+ bcount += sprintf(&bufptr[bcount], "%-20s ",
+ "unlimited");
else
- count += sprintf(&bufptr[count], "%-20lu ",
+ bcount += sprintf(&bufptr[bcount], "%-20lu ",
rlim[i].rlim_max);

if (lnames[i].unit)
- count += sprintf(&bufptr[count], "%-10s\n",
+ bcount += sprintf(&bufptr[bcount], "%-10s\n",
lnames[i].unit);
else
- count += sprintf(&bufptr[count], "\n");
+ bcount += sprintf(&bufptr[bcount], "\n");
+ }
+
+ if (*ppos >= bcount)
+ goto out_task;
+
+ ccount = min(count, (size_t)(bcount-(*ppos)));
+ ccount = ccount - copy_to_user(buf, &bufptr[*ppos], ccount);
+ *ppos += ccount;
+ kfree(bufptr);
+out_task:
+ put_task_struct(task);
+out:
+ return ccount;
+}
+
+static ssize_t proc_pid_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *buffer;
+ char *element, *vmc, *vmm;
+ struct rlimit new_rlim;
+ unsigned long flags;
+ int i;
+ int index = -1;
+ size_t wcount = 0;
+ struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+
+
+ if (*ppos != 0)
+ goto out;
+
+ if (count > 128)
+ goto out;
+ buffer = kzalloc(128, GFP_KERNEL);
+
+ if (!buffer)
+ goto out;
+
+ element = kzalloc(sizeof(buffer), GFP_KERNEL);
+ vmc = kzalloc(sizeof(buffer), GFP_KERNEL);
+ vmm = kzalloc(sizeof(buffer), GFP_KERNEL);
+
+ if (!element || !vmm || !vmc)
+ goto out_free;
+
+ wcount = count - copy_from_user(buffer, buf, count);
+ if (wcount < count)
+ goto out_free;
+
+ i = sscanf(buffer, "%s %s %s", element, vmc, vmm);
+
+ if (i < 3)
+ goto out_free;
+
+ for (i = 0; i <= strlen(element); i++)
+ element[i] = tolower(element[i]);
+
+ if (!strncmp(vmc, "unlimited", 9))
+ new_rlim.rlim_cur = RLIM_INFINITY;
+ else
+ new_rlim.rlim_cur = simple_strtoull(vmc, NULL, 10);
+
+ if (!strncmp(vmm, "unlimited", 9))
+ new_rlim.rlim_max = RLIM_INFINITY;
+ else
+ new_rlim.rlim_max = simple_strtoull(vmm, NULL, 10);
+
+ for (i = 0; i < RLIM_NLIMITS; i++) {
+ if ((lnames[i].match) &&
+ !strncmp(element, lnames[i].match,
+ strlen(lnames[i].match))) {
+ index = i;
+ break;
+ }
}

+ if (!lock_task_sighand(task, &flags))
+ goto out_free;
+
+ if ((index >= 0) && (index < RLIM_NLIMITS))
+ do_setrlimit(index, &new_rlim, task);
+
+ unlock_task_sighand(task, &flags);
+
+out_free:
+ kfree(element);
+ kfree(vmc);
+ kfree(vmm);
+ kfree(buffer);
+out:
+ *ppos += count;
+ put_task_struct(task);
return count;
}

+
+static const struct file_operations proc_limit_operations = {
+ .read = proc_pid_limit_read,
+ .write = proc_pid_limit_write,
+};
+
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
static int proc_pid_syscall(struct task_struct *task, char *buffer)
{
@@ -2483,7 +2590,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ REG("limits", S_IRUSR|S_IWUSR, proc_limit_operations),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
@@ -2822,7 +2929,7 @@ static const struct pid_entry tid_base_stuff[] = {
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ REG("limits", S_IRUSR|S_IWUSR, proc_limit_operations),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f1ea4a..cada5d2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -631,6 +631,9 @@ struct signal_struct {
#endif
};

+extern int do_setrlimit(unsigned int resource, struct rlimit *new_rlim,
+ struct task_struct *tsk);
+
/* Context switch must be unlocked if interrupts are to be enabled */
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
# define __ARCH_WANT_UNLOCKED_CTXSW
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097..05bd22a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1236,41 +1236,41 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,

#endif

-SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+int do_setrlimit(unsigned int resource, struct rlimit *new_rlim,
+ struct task_struct *tsk)
{
- struct rlimit new_rlim, *old_rlim;
int retval;
+ struct rlimit *old_rlim;

- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
- if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
- return -EFAULT;
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
+
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
return -EINVAL;
- old_rlim = current->signal->rlim + resource;
- if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
+ old_rlim = tsk->signal->rlim + resource;
+
+ if ((new_rlim->rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
+
+ if (resource == RLIMIT_NOFILE && new_rlim->rlim_max > sysctl_nr_open)
return -EPERM;

- retval = security_task_setrlimit(resource, &new_rlim);
+ retval = security_task_setrlimit(resource, new_rlim);
if (retval)
return retval;

- if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
/*
* The caller is asking for an immediate RLIMIT_CPU
* expiry. But we use the zero value to mean "it was
* never set". So let's cheat and make it one second
* instead
*/
- new_rlim.rlim_cur = 1;
+ new_rlim->rlim_cur = 1;
}

- task_lock(current->group_leader);
- *old_rlim = new_rlim;
- task_unlock(current->group_leader);
+ task_lock(tsk->group_leader);
+ *old_rlim = *new_rlim;
+ task_unlock(tsk->group_leader);

if (resource != RLIMIT_CPU)
goto out;
@@ -1281,14 +1281,26 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
* very long-standing error, and fixing it now risks breakage of
* applications, so we live with it
*/
- if (new_rlim.rlim_cur == RLIM_INFINITY)
+ if (new_rlim->rlim_cur == RLIM_INFINITY)
goto out;

- update_rlimit_cpu(new_rlim.rlim_cur);
+ update_rlimit_cpu(new_rlim->rlim_cur);
out:
return 0;
}

+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ struct rlimit new_rlim;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ return -EFAULT;
+
+ return do_setrlimit(resource, &new_rlim, current);
+}
+
/*
* It would make sense to put struct rusage in the task_struct,
* except that would make the task_struct be *really big*. After
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/