Re: [RFC] ps command race fix

From: Eric W. Biederman
Date: Thu Aug 17 2006 - 09:38:10 EST


KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> writes:

> On Wed, 16 Aug 2006 22:59:50 -0600
> ebiederm@xxxxxxxxxxxx (Eric W. Biederman) wrote:
>
>>
>> So I just threw something together that seems to work.
>>
>> All of the pid are listed in order and the next used pid is found by
>> scanning the pid bitmap.
>>
>> Scanning the pid bitmap might be a little heavy but actually is likely
>> quite cache friendly as the accesses are extremely predictable, and
>> everything should fit into 64 cache lines, which is fewer cache lines
>> than walking a tree, and more predictable. Of course I turn around
>> and then do a hash table lookup which is at least one more cache line
>> and probably an unpredictable one at that.
>>
>> The point despite the brute force nature I have no reason to suspect
>> this will run any noticeably slower than the current implementation.
>>
>> Look at this try it out and if this solves the problem we can push
>> this upstream.
>>
> At first, Thanks.
>
> question:
>
> task = get_pid_task(find_next_pid(tgid), PIDTYPE_PID);
>
> Does this return only "task/process" ? and never return "thread" ?
>
> My another concern is that newly-created-process while ps running cannot be
> catched
> by this kind of "table walking" approach (as my previous work)
> But if people say O.K, I have no complaint.
>
> I(we)'ll post another list-based one in the next week, anyway.
> (I can't go-ahead this week...)
>

Here is a respin with a fixed version of next_tgid. I believe
I have all of the silly corner cases handled. So if the pid
I have found is only a process group or session id or not
a thread_group leader it should not be ignored.

> static struct task_struct *next_tgid(unsigned int tgid)
> {
> struct task_struct *task;
> struct pid *pid;
>
> task = NULL;
> rcu_read_lock();
> retry:
> pid = find_next_pid(tgid);
> if (pid) {
> tgid = pid->nr + 1;
> task = pid_task(pid, PIDTYPE_PID);
> if (!task || !thread_group_leader(task))
> goto retry;
> get_task_struct(task);
> }
> rcu_read_unlock();
> return task;
>
> }

Seeking should now just work.

---
fs/proc/base.c | 89 +++++++++++++---------------------------------------
include/linux/pid.h | 1
kernel/pid.c | 35 ++++++++++++++++++++
3 files changed, 59 insertions(+), 66 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9943527..05dc244 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,70 +1813,28 @@ out:
}

/*
- * Find the first tgid to return to user space.
+ * Find the first task with tgid >= tgid
*
- * Usually this is just whatever follows &init_task, but if the users
- * buffer was too small to hold the full list or there was a seek into
- * the middle of the directory we have more work to do.
- *
- * In the case of a short read we start with find_task_by_pid.
- *
- * In the case of a seek we start with &init_task and walk nr
- * threads past it.
*/
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+static struct task_struct *next_tgid(unsigned int tgid)
{
- struct task_struct *pos;
- rcu_read_lock();
- if (tgid && nr) {
- pos = find_task_by_pid(tgid);
- if (pos && thread_group_leader(pos))
- goto found;
- }
- /* If nr exceeds the number of processes get out quickly */
- pos = NULL;
- if (nr && nr >= nr_processes())
- goto done;
-
- /* If we haven't found our starting place yet start with
- * the init_task and walk nr tasks forward.
- */
- for (pos = next_task(&init_task); nr > 0; --nr) {
- pos = next_task(pos);
- if (pos == &init_task) {
- pos = NULL;
- goto done;
- }
- }
-found:
- get_task_struct(pos);
-done:
- rcu_read_unlock();
- return pos;
-}
+ struct task_struct *task;
+ struct pid *pid;

-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
-{
- struct task_struct *pos;
+ task = NULL;
rcu_read_lock();
- pos = start;
- if (pid_alive(start))
- pos = next_task(start);
- if (pid_alive(pos) && (pos != &init_task)) {
- get_task_struct(pos);
- goto done;
+retry:
+ pid = find_next_pid(tgid);
+ if (pid) {
+ tgid = pid->nr + 1;
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task || !thread_group_leader(task))
+ goto retry;
+ get_task_struct(task);
}
- pos = NULL;
-done:
rcu_read_unlock();
- put_task_struct(start);
- return pos;
+ return task;
+
}

static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
@@ -1888,6 +1846,8 @@ static int proc_pid_fill_cache(struct fi
proc_pid_instantiate, task, NULL);
}

+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff) - 1)
+
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
@@ -1906,23 +1866,20 @@ int proc_pid_readdir(struct file * filp,
}
nr -= ARRAY_SIZE(proc_base_stuff) - 1;

- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
- */
- tgid = filp->f_version;
- filp->f_version = 0;
- for (task = first_tgid(tgid, nr);
+ tgid = filp->f_pos - TGID_OFFSET;
+ for (task = next_tgid(tgid);
task;
- task = next_tgid(task), filp->f_pos++) {
+ task = next_tgid(tgid + 1)) {
tgid = task->pid;
+ filp->f_pos = tgid + TGID_OFFSET;
if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- filp->f_version = tgid;
put_task_struct(task);
- break;
+ goto out;
}
}
+ filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
out:
put_task_struct(reaper);
out_no_task:
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9fd547f..d06d4ba 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -89,6 +89,7 @@ extern struct pid *FASTCALL(find_pid(int
* Lookup a PID in the hash table, and return with it's count elevated.
*/
extern struct pid *find_get_pid(int nr);
+extern struct pid *find_next_pid(int nr);

extern struct pid *alloc_pid(void);
extern void FASTCALL(free_pid(struct pid *pid));
diff --git a/kernel/pid.c b/kernel/pid.c
index 40e8e4d..112ff2a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -145,6 +145,23 @@ static int alloc_pidmap(void)
return -1;
}

+static int next_pidmap(int last)
+{
+ int offset;
+ pidmap_t *map;
+
+ offset = (last + 1) & BITS_PER_PAGE_MASK;
+ map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
+ for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+ if (unlikely(!map->page))
+ continue;
+ offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+ if (offset < BITS_PER_PAGE)
+ return mk_pid(map, offset);
+ }
+ return -1;
+}
+
fastcall void put_pid(struct pid *pid)
{
if (!pid)
@@ -307,6 +324,24 @@ struct pid *find_get_pid(pid_t nr)
EXPORT_SYMBOL_GPL(find_get_pid);

/*
+ * Used by proc to find the pid with the next greater number.
+ * Specifying nr is used to handle the seek case.
+ */
+struct pid *find_next_pid(int nr)
+{
+ struct pid *next;
+
+ next = find_pid(nr);
+ while (!next) {
+ nr = next_pidmap(nr);
+ if (nr <= 0)
+ break;
+ next = find_pid(nr);
+ }
+ return next;
+}
+
+/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
* more.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/