(1/4) [PATCH] cpuset -- 2.6.0-test8

From: Stephen Hemminger
Date: Tue Oct 21 2003 - 18:23:11 EST


Here is an update of the last cpuset patch (for 2.6.0-test5) to 2.6.0-test8.
The only two changes were reconciling the changes to fs/proc/base.c and the
syscall numbers.
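
For anyone who wants to experiment, here is a rough, untested userspace
sketch of how the five calls fit together. It is not part of the patch:
the numbers are the i386 ones added below (ia64 uses 1267-1271), there
are no glibc wrappers, and it assumes cpumask_t fits in a single
unsigned long.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* i386 syscall numbers and flag, mirroring this patch */
#define __NR_cpuset_create	277
#define __NR_cpuset_destroy	278
#define __NR_cpuset_alloc	279
#define __NR_cpuset_attach	280
#define __NR_cpuset_getfreecpus	281

typedef unsigned int cpuset_t;
#define CPUSET_STRICT	0x00000001

int main(void)
{
	cpuset_t cs;
	unsigned long free, mask;

	/* which CPUs of our cpuset are free for a strict reservation ? */
	if (syscall(__NR_cpuset_getfreecpus, CPUSET_STRICT,
		    sizeof(free), &free) < 0) {
		perror("cpuset_getfreecpus");
		return 1;
	}

	/* create a strict child cpuset on the lowest free CPU ... */
	mask = free & ~(free - 1);
	if (syscall(__NR_cpuset_create, &cs, CPUSET_STRICT) < 0) {
		perror("cpuset_create");
		return 1;
	}
	if (syscall(__NR_cpuset_alloc, cs, sizeof(mask), &mask) < 0) {
		perror("cpuset_alloc");
		return 1;
	}

	/* ... and move ourselves into it (pid 0 == calling process) */
	if (syscall(__NR_cpuset_attach, cs, 0) < 0) {
		perror("cpuset_attach");
		return 1;
	}

	printf("now running in cpuset %u\n", cs);
	return 0;
}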

diff -Nru a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
--- a/arch/i386/kernel/cpu/proc.c Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/cpu/proc.c Tue Oct 21 16:05:27 2003
@@ -4,6 +4,12 @@
#include <asm/semaphore.h>
#include <linux/seq_file.h>

+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/cpuset_types.h>
+#endif
+
/*
* Get CPU information for use by the procfs.
*/
@@ -63,12 +69,22 @@
if (!cpu_online(n))
return 0;
#endif
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+ /* show only CPUs in current cpuset */
+ if (!cpu_isset(n, current->cpuset->cpus_allowed))
+ return 0;
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */
+
seq_printf(m, "processor\t: %d\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
"model\t\t: %d\n"
"model name\t: %s\n",
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+ cpuset_realtologic_cpuid(current->cpuset, n),
+#else
n,
+#endif
c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
c->x86,
c->x86_model,
diff -Nru a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/entry.S Tue Oct 21 16:05:27 2003
@@ -880,5 +880,14 @@
.long sys_utimes
.long sys_fadvise64_64
.long sys_ni_syscall /* sys_vserver */
+ .long sys_ni_syscall
+ .long sys_ni_syscall /* 275 */
+ .long sys_ni_syscall
+#ifdef CONFIG_CPUSETS
+ .long sys_cpuset_create
+ .long sys_cpuset_destroy
+ .long sys_cpuset_alloc
+ .long sys_cpuset_attach /* 280 */
+ .long sys_cpuset_getfreecpus
+#else
+ .long sys_ni_syscall
+ .long sys_ni_syscall
+ .long sys_ni_syscall
+ .long sys_ni_syscall /* 280 */
+ .long sys_ni_syscall
+#endif

nr_syscalls=(.-sys_call_table)/4
diff -Nru a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
--- a/arch/ia64/kernel/entry.S Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/entry.S Tue Oct 21 16:05:27 2003
@@ -1481,11 +1481,19 @@
data8 ia64_ni_syscall
data8 ia64_ni_syscall // 1265
data8 ia64_ni_syscall
+#ifdef CONFIG_CPUSETS
+ data8 sys_cpuset_create
+ data8 sys_cpuset_destroy
+ data8 sys_cpuset_alloc
+ data8 sys_cpuset_attach // 1270
+ data8 sys_cpuset_getfreecpus
+#else
data8 ia64_ni_syscall
data8 ia64_ni_syscall
data8 ia64_ni_syscall
data8 ia64_ni_syscall // 1270
data8 ia64_ni_syscall
+#endif
data8 ia64_ni_syscall
data8 ia64_ni_syscall
data8 ia64_ni_syscall
diff -Nru a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
--- a/arch/ia64/kernel/setup.c Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/setup.c Tue Oct 21 16:05:27 2003
@@ -50,6 +50,10 @@
#include <asm/system.h>
#include <asm/unistd.h>

+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+# include <linux/cpuset_types.h>
+#endif
+
#if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
# error "struct cpuinfo_ia64 too big!"
#endif
@@ -383,6 +387,15 @@
unsigned long mask;
int i;

+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+ /* show only CPUs in current cpuset */
+ if (!current->cpuset)
+ BUG();
+
+ if (!cpu_isset(cpunum, current->cpuset->cpus_allowed))
+ return 0;
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */
+
mask = c->features;

switch (c->family) {
@@ -427,7 +440,12 @@
"cpu MHz : %lu.%06lu\n"
"itc MHz : %lu.%06lu\n"
"BogoMIPS : %lu.%02lu\n\n",
- cpunum, c->vendor, family, c->model, c->revision, c->archrev,
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+ cpuset_realtologic_cpuid(current->cpuset, cpunum),
+#else
+ cpunum,
+#endif
+ c->vendor, family, c->model, c->revision, c->archrev,
features, c->ppn, c->number,
c->proc_freq / 1000000, c->proc_freq % 1000000,
c->itc_freq / 1000000, c->itc_freq % 1000000,
diff -Nru a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c Tue Oct 21 16:05:27 2003
+++ b/fs/proc/base.c Tue Oct 21 16:05:27 2003
@@ -60,6 +60,9 @@
PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+#ifdef CONFIG_CPUSETS_PROC
+ PROC_TGID_CPUSET,
+#endif
#ifdef CONFIG_SECURITY
PROC_TGID_ATTR,
PROC_TGID_ATTR_CURRENT,
@@ -123,6 +126,9 @@
#ifdef CONFIG_KALLSYMS
E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO),
#endif
+#ifdef CONFIG_CPUSETS_PROC
+ E(PROC_TGID_CPUSET, "cpuset", S_IFREG|S_IRUGO),
+#endif
{0,0,NULL,0}
};
static struct pid_entry tid_base_stuff[] = {
@@ -366,6 +372,11 @@
}
#endif /* CONFIG_KALLSYMS */

+
+#ifdef CONFIG_CPUSETS_PROC
+int proc_pid_cpuset(struct task_struct *task, char *buffer);
+#endif /* CONFIG_CPUSETS_PROC */
+
/************************************************************************/
/* Here the fs part begins */
/************************************************************************/
@@ -1359,6 +1370,12 @@
case PROC_TGID_WCHAN:
inode->i_fop = &proc_info_file_operations;
ei->op.proc_read = proc_pid_wchan;
+ break;
+#endif
+#ifdef CONFIG_CPUSETS_PROC
+ case PROC_TGID_CPUSET:
+ inode->i_fop = &proc_info_file_operations;
+ ei->op.proc_read = proc_pid_cpuset;
break;
#endif
default:
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c Tue Oct 21 16:05:27 2003
+++ b/fs/proc/proc_misc.c Tue Oct 21 16:05:27 2003
@@ -51,6 +51,10 @@
#include <asm/tlb.h>
#include <asm/div64.h>

+#ifdef CONFIG_CPUSETS_PROC_STAT
+# include <linux/cpuset_types.h>
+#endif
+
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
@@ -382,6 +386,12 @@
int j;

if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+ /* show only CPUs in current cpuset */
+ if (!cpu_isset(i, current->cpuset->cpus_allowed))
+ continue;
+#endif
+
user += kstat_cpu(i).cpustat.user;
nice += kstat_cpu(i).cpustat.nice;
system += kstat_cpu(i).cpustat.system;
@@ -403,8 +413,17 @@
jiffies_to_clock_t(softirq));
for (i = 0; i < NR_CPUS; i++){
if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+ /* show only CPUs in current cpuset */
+ if (!cpu_isset(i, current->cpuset->cpus_allowed))
+ continue;
+#endif
seq_printf(p, "cpu%d %u %u %u %u %u %u %u\n",
+#ifdef CONFIG_CPUSETS_PROC_STAT
+ cpuset_realtologic_cpuid(current->cpuset, i),
+#else
i,
+#endif
jiffies_to_clock_t(kstat_cpu(i).cpustat.user),
jiffies_to_clock_t(kstat_cpu(i).cpustat.nice),
jiffies_to_clock_t(kstat_cpu(i).cpustat.system),
diff -Nru a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
--- a/include/asm-i386/unistd.h Tue Oct 21 16:05:27 2003
+++ b/include/asm-i386/unistd.h Tue Oct 21 16:05:27 2003
@@ -280,7 +280,13 @@
#define __NR_fadvise64_64 272
#define __NR_vserver 273

-#define NR_syscalls 274
+#define __NR_sys_cpuset_create 277
+#define __NR_sys_cpuset_destroy 278
+#define __NR_sys_cpuset_alloc 279
+#define __NR_sys_cpuset_attach 280
+#define __NR_sys_cpuset_getfreecpus 281
+
+#define NR_syscalls 282

/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */

diff -Nru a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
--- a/include/asm-ia64/unistd.h Tue Oct 21 16:05:27 2003
+++ b/include/asm-ia64/unistd.h Tue Oct 21 16:05:27 2003
@@ -253,6 +253,12 @@

#define NR_syscalls 256 /* length of syscall table */

+#define __NR_sys_cpuset_create 1267
+#define __NR_sys_cpuset_destroy 1268
+#define __NR_sys_cpuset_alloc 1269
+#define __NR_sys_cpuset_attach 1270
+#define __NR_sys_cpuset_getfreecpus 1271
+
#if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)

extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
diff -Nru a/include/linux/cpuset.h b/include/linux/cpuset.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset.h Tue Oct 21 16:05:27 2003
@@ -0,0 +1,29 @@
+/*
+ * BULL cpuset interface
+ */
+
+#ifndef _LINUX_CPUSET_H
+#define _LINUX_CPUSET_H
+
+typedef unsigned int cpuset_t;
+
+#define CPUSET_STRICT 0x00000001
+#define CPUSET_AUTOCLEAN 0x00000002
+
+#ifdef __KERNEL__
+
+#include <linux/cpumask.h>
+
+extern struct cpuset top_cpuset;
+
+void use_cpuset(struct cpuset *);
+void release_cpuset(struct cpuset *);
+
+struct task_struct;
+int cpuset_setaffinity(struct task_struct * task, cpumask_t mask);
+
+void cpusets_update_cpus_online(void);
+
+int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CPUSET_H */
diff -Nru a/include/linux/cpuset_types.h b/include/linux/cpuset_types.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset_types.h Tue Oct 21 16:05:27 2003
@@ -0,0 +1,39 @@
+#ifndef _LINUX_CPUSET_TYPES_H
+#define _LINUX_CPUSET_TYPES_H
+
+
+struct cpuset {
+ cpuset_t id;
+ int flags;
+ int has_been_attached;
+
+ /* bitmask of the cpus present in this cpuset */
+ cpumask_t cpus_allowed;
+
+ /* bitmask of the cpus reserved in this cpuset */
+ cpumask_t cpus_reserved;
+
+ /* bitmask of the cpus reserved with CPUSET_STRICT */
+ cpumask_t cpus_strictly_reserved;
+
+ struct cpuset * parent;
+ struct list_head list; /* for the whole list */
+
+ struct list_head children;
+ struct list_head brothers;
+
+ /* overall users (processes + children) */
+ atomic_t count;
+
+ spinlock_t attach_lock;
+
+ /* owner */
+ uid_t uid;
+ uid_t suid;
+};
+
+
+#endif
diff -Nru a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h Tue Oct 21 16:05:27 2003
+++ b/include/linux/init_task.h Tue Oct 21 16:05:27 2003
@@ -56,6 +56,12 @@
.siglock = SPIN_LOCK_UNLOCKED, \
}

+#ifdef CONFIG_CPUSETS
+#define CPUSET_TSKINIT(a,b) .a = b,
+#else
+#define CPUSET_TSKINIT(a,b)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -108,6 +114,9 @@
.proc_lock = SPIN_LOCK_UNLOCKED, \
.switch_lock = SPIN_LOCK_UNLOCKED, \
.journal_info = NULL, \
+ CPUSET_TSKINIT(cpus_wanted, CPU_MASK_ALL) \
+ CPUSET_TSKINIT(cpuset, &top_cpuset) \
+ CPUSET_TSKINIT(cpuset_attach_lock, SPIN_LOCK_UNLOCKED) \
}


diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Tue Oct 21 16:05:27 2003
+++ b/include/linux/sched.h Tue Oct 21 16:05:27 2003
@@ -29,6 +29,7 @@
#include <linux/completion.h>
#include <linux/pid.h>
#include <linux/percpu.h>
+#include <linux/cpuset.h>

struct exec_domain;

@@ -464,6 +465,13 @@

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
+
+/* cpuset info */
+#ifdef CONFIG_CPUSETS
+ struct cpuset * cpuset;
+ cpumask_t cpus_wanted; /* mask requested via sched_setaffinity */
+ spinlock_t cpuset_attach_lock;
+#endif
};

static inline pid_t process_group(struct task_struct *tsk)
diff -Nru a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig Tue Oct 21 16:05:27 2003
+++ b/init/Kconfig Tue Oct 21 16:05:27 2003
@@ -194,6 +194,41 @@
Disabling this option will cause the kernel to be built without
support for epoll family of system calls.

+if X86 || IA64
+
+config CPUSETS
+ bool "cpusets"
+ depends on SMP
+ help
+ This option lets you create and manage sets of CPUs on which
+ processes are allowed to run.
+
+ Say N if unsure.
+
+config CPUSETS_PROC
+ bool "/proc/cpusets support"
+ depends on CPUSETS
+ help
+ Provides information about the existing cpusets in your system
+ (/proc/cpusets and /proc/<pid>/cpuset). To use this option, make
+ sure that "/proc file system support" (CONFIG_PROC_FS) is enabled,
+ too.
+
+config CPUSETS_PROC_CPUINFO
+ bool "/proc/cpuinfo uses current cpuset"
+ depends on CPUSETS_PROC
+ help
+ With this option enabled, a process reading /proc/cpuinfo will
+ only see the CPUs that are in its current cpuset.
+
+config CPUSETS_PROC_STAT
+ bool "/proc/stat uses current cpuset"
+ depends on CPUSETS_PROC
+ help
+ With this option enabled, a process reading /proc/stat will
+ only see the CPUs that are in its current cpuset.
+
+endif
+
source "drivers/block/Kconfig.iosched"

endmenu # General setup
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c Tue Oct 21 16:05:27 2003
+++ b/init/main.c Tue Oct 21 16:05:27 2003
@@ -39,6 +39,13 @@
#include <linux/writeback.h>
#include <linux/cpu.h>

+
+
+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
+
#include <asm/io.h>
#include <asm/bugs.h>

@@ -85,6 +92,7 @@
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
+extern void cpusets_init(void);

#ifdef CONFIG_TC
extern void tc_init(void);
@@ -456,6 +464,10 @@
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
+#ifdef CONFIG_CPUSETS
+ cpusets_init();
+#endif
+
check_bugs();
printk("POSIX conformance testing by UNIFIX\n");

@@ -524,6 +536,10 @@
*/
static void __init do_basic_setup(void)
{
+#ifdef CONFIG_CPUSETS
+ cpusets_update_cpus_online();
+#endif
+
driver_init();

#ifdef CONFIG_SYSCTL
@@ -579,6 +595,7 @@
do_basic_setup();

prepare_namespace();
+

/*
* Ok, we have completed the initial bootup, and
diff -Nru a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile Tue Oct 21 16:05:27 2003
+++ b/kernel/Makefile Tue Oct 21 16:05:27 2003
@@ -19,6 +19,7 @@
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKCONFIG_PROC) += configs.o
+obj-$(CONFIG_CPUSETS) += cpuset.o

ifneq ($(CONFIG_IA64),y)
# According to Alan Modra <alan@xxxxxxxxxxxxxxxx>, the -fno-omit-frame-pointer is
diff -Nru a/kernel/cpuset.c b/kernel/cpuset.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/kernel/cpuset.c Tue Oct 21 16:05:27 2003
@@ -0,0 +1,785 @@
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/slab.h> /* for kmalloc */
+#include <linux/list.h>
+#include <linux/sched.h> /* for find_task_by_pid and task_struct */
+#include <asm/uaccess.h>
+#include <linux/errno.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpuset_types.h>
+
+#define info(args...) do {} while(0)
+//#define info(args...) printk(KERN_INFO args)
+
+
+#ifdef CPU_ARRAY_SIZE
+#warning "CPU ARRAY SIZE !"
+#endif
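+/* protects the global cpuset list, the parent/children links
+ * and the cpus_reserved/cpus_strictly_reserved masks
+ */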
+rwlock_t cpuset_lock = RW_LOCK_UNLOCKED;
+
+#define CPUSET_TOP_ID 1
+
+struct cpuset top_cpuset = {
+ .id = CPUSET_TOP_ID,
+ .flags = CPUSET_STRICT,
+ .cpus_reserved = CPU_MASK_NONE,
+ .cpus_strictly_reserved = CPU_MASK_NONE,
+ .parent = 0,
+ .children = LIST_HEAD_INIT(top_cpuset.children),
+ .brothers = LIST_HEAD_INIT(top_cpuset.brothers),
+ .list = LIST_HEAD_INIT(top_cpuset.list),
+ .count = ATOMIC_INIT(1), /* this cpuset can't be deleted */
+ .has_been_attached = 0,
+ .uid = 0,
+ .attach_lock = SPIN_LOCK_UNLOCKED,
+ .suid = 0
+};
+
+
+static int proc_cpusets_init(void);
+
+int __init cpusets_init(void)
+{
+ info("cpusets ("__FILE__ " compiled " __DATE__ " " __TIME__ "initializing..\n");
+
+
+#ifdef CONFIG_CPUSETS_PROC
+ proc_cpusets_init();
+#endif /* CONFIG_CPUSETS_PROC */
+ return 0;
+}
+
+/*
+ * later this function may be used to indicate that a CPU has been put
+ * online/offline
+ * BUT currently it only exists because cpu_online_map becomes available
+ * only late during kernel boot
+ */
+void cpusets_update_cpus_online(void)
+{
+ top_cpuset.cpus_allowed = cpu_online_map ;
+}
+
+
+static const int N = (8*sizeof(cpumask_t));
+/* mask must NOT be ZERO ! */
+/* this is a cyclic version of next_cpu */
+static inline void _next_cpu(const cpumask_t mask, int * index)
+{
+ for(;;) {
+ if (++*index >= N) *index = 0;
+ if (cpu_isset(*index, mask)) return;
+ }
+}
+
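+/* map the "logical" CPUs of wanted onto the CPUs actually present
+ * in allowed, wrapping around cyclically: e.g. wanted = {0,2} with
+ * allowed = {4,5,6,7} yields {4,6}
+ */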
+static cpumask_t cpuset_combine_mask(const cpumask_t wanted, const cpumask_t allowed)
+{
+ int i;
+ cpumask_t mask;
+
+ /* start with current cpu out of the mask
+ * so the first call to next_cpu will take the first cpu
+ * even if it is cpu zero
+ */
+ int cpu = N;
+
+ cpus_clear(mask);
+
+ if (cpus_empty(wanted) || cpus_empty(allowed))
+ return mask;
+
+ for(i=0; i < N; i++) {
+ _next_cpu(allowed, &cpu);
+ if (cpu_isset(i, wanted))
+ cpu_set(cpu, mask);
+ }
+ info("cpuset_combine_mask: %016lx + %016lx --> %016lx\n",
+ wanted, allowed, mask);
+ return mask;
+}
+
+/* translate a "real" cpu number to a "inside cpuset" (aka logical)
+ * cpu number. Used for /proc/cpuinfo
+ */
+int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid)
+{
+ int i;
+ int l = 0;
+ for(i=0; i < NR_CPUS; i++)
+ {
+ if (i == cpuid) return l;
+ if (cpu_isset(i, cs->cpus_allowed))
+ l++;
+ }
+ /* NOT REACHED */
+ BUG();
+ return 0;
+}
+
+static struct cpuset * find_cpuset_by_id(cpuset_t id)
+{
+ struct cpuset * cs;
+ if (id == CPUSET_TOP_ID) return &top_cpuset;
+
+ list_for_each_entry(cs, &top_cpuset.list, list) {
+ if (cs->id == id) return cs;
+ }
+ /* Not found */
+ return NULL;
+}
+
+/* increment a cpuset use count */
+void use_cpuset(struct cpuset * cs)
+{
+ atomic_inc(&cs->count);
+}
+
+static void check_cpuset_autoclean(struct cpuset *);
+
+/* decrement a cpuset use count, and maybe autodestroy it */
+/* cpuset_lock MUST NOT BE HELD */
+void release_cpuset(struct cpuset * cs)
+{
+ if (atomic_dec_and_test(&cs->count))
+ check_cpuset_autoclean(cs);
+}
+
+/* find a free cpuset ID */
+static cpuset_t cpuset_mkid(void)
+{
+ static cpuset_t curid = CPUSET_TOP_ID;
+
+ while (find_cpuset_by_id(++curid));
+
+ return curid;
+}
+
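+/* create a new (empty) cpuset as a child of the current cpuset
+ * and return its id through *cpusetp
+ */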
+asmlinkage long sys_cpuset_create(cpuset_t * cpusetp, int flags)
+{
+ struct cpuset * cs;
+
+ info("sys_cpuset_create(%016lx, %d) called\n",
+ (unsigned long) cpusetp, flags);
+
+ /* can only create a strict cs in another strict cs */
+ if ((flags & CPUSET_STRICT) && (!(current->cpuset->flags & CPUSET_STRICT)))
+ return -EINVAL;
+
+ /* check if given pointer is valid */
+ if (verify_area(VERIFY_WRITE, cpusetp, sizeof(cpuset_t)))
+ return -EFAULT;
+
+ cs = (struct cpuset *) kmalloc(sizeof(struct cpuset), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ cs->flags = flags;
+ atomic_set(&cs->count, 0);
+ INIT_LIST_HEAD(&cs->children);
+ cpus_clear(cs->cpus_allowed);
+ cpus_clear(cs->cpus_reserved);
+ cpus_clear(cs->cpus_strictly_reserved);
+ cs->has_been_attached = 0;
+ cs->uid = current->uid;
+ cs->suid = current->suid;
+ cs->attach_lock = SPIN_LOCK_UNLOCKED;
+
+ cs->parent = current->cpuset;
+
+ use_cpuset(cs->parent);
+
+ write_lock(&cpuset_lock);
+
+ cs->id = cpuset_mkid();
+ list_add(&cs->brothers, &cs->parent->children);
+ list_add(&cs->list, &top_cpuset.list);
+
+ write_unlock(&cpuset_lock);
+
+ if (put_user(cs->id, cpusetp))
+ info("put_user failed !\n");
+
+ return 0;
+}
+
+
+static inline int bad_permission(struct cpuset * cs)
+{
+ return ((current->euid) && (current->euid != cs->uid) && (current->euid != cs->suid));
+}
+
+static void __cpuset_destroy(struct cpuset * cs);
+
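+/* destroy an unused cpuset; fails with -EBUSY while processes
+ * or children still reference it
+ */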
+asmlinkage long sys_cpuset_destroy(cpuset_t cpuset)
+{
+ struct cpuset * cs;
+
+ info("sys_cpuset_destroy(%d) called\n", cpuset);
+
+ if (cpuset == CPUSET_TOP_ID)
+ return -EINVAL;
+
+ read_lock(&cpuset_lock);
+ cs = find_cpuset_by_id(cpuset);
+
+ if (!cs) {
+ read_unlock(&cpuset_lock);
+ return -EINVAL;
+ }
+
+ use_cpuset(cs);
+ read_unlock(&cpuset_lock);
+
+ if (bad_permission(cs)) {
+ release_cpuset(cs);
+ return -EPERM;
+ }
+
+ write_lock(&cpuset_lock);
+ /* there's at least 1 user (us), if there's more, we can't destroy cs */
+ if (atomic_read(&cs->count) > 1) {
+ write_unlock(&cpuset_lock);
+ release_cpuset(cs);
+ return -EBUSY;
+ }
+
+ /* everything OK, destroy it */
+ __cpuset_destroy(cs);
+ /* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+
+ return 0;
+}
+
+static void rebuild_reserved_masks(struct cpuset * csp)
+{
+ cpumask_t r;
+ cpumask_t sr;
+ struct cpuset * cs;
+ info("Updating cpuset %d masks\n", csp->id);
+
+ cpus_clear(r);
+ cpus_clear(sr);
+
+ list_for_each_entry(cs, &csp->children, brothers) {
+ info(" child %d\n", cs->id);
+ cpus_or(r, r, cs->cpus_allowed);
+ if (cs->flags & CPUSET_STRICT)
+ cpus_or(sr, sr, cs->cpus_allowed);
+ }
+ csp->cpus_reserved = r;
+ csp->cpus_strictly_reserved = sr;
+}
+
+/* REALLY destroy a cpuset
+ * NOTE:
+ * -> write cpuset_lock must be held
+ * -> ----------------- WILL BE RELEASED
+ * this ugly hack is necessary to call release_cpuset(parent)
+ */
+static void __cpuset_destroy(struct cpuset * cs)
+{
+ list_del(&cs->list);
+ list_del(&cs->brothers);
+
+ /* cs will never be top_cpuset, so ->parent exists */
+ rebuild_reserved_masks(cs->parent);
+
+ write_unlock(&cpuset_lock);
+ release_cpuset(cs->parent);
+
+ kfree(cs);
+}
+
+/* remove an unused cpuset if it has the CPUSET_AUTOCLEAN flag */
+static void check_cpuset_autoclean(struct cpuset * cs)
+{
+ if (!(cs->flags & CPUSET_AUTOCLEAN)) return; /* not autoclean */
+ if (!cs->has_been_attached) return;
+
+ write_lock(&cpuset_lock);
+
+ if (atomic_read(&cs->count) > 0) { /* still in use */
+ write_unlock(&cpuset_lock);
+ return;
+ }
+
+ info("autocleaning cpuset %d\n", cs->id);
+
+ __cpuset_destroy(cs);
+ /* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+}
+
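+/* attach a process (pid 0 means the caller) to a cpuset and
+ * rebind it to the cpuset's CPUs
+ */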
+asmlinkage long sys_cpuset_attach(cpuset_t cpuset, pid_t pid)
+{
+ struct cpuset * cs;
+ struct task_struct * task;
+
+ info("sys_cpuset_attach(%d, %d) called\n", cpuset, pid);
+
+ read_lock(&cpuset_lock);
+ cs = find_cpuset_by_id(cpuset);
+
+ if (!cs) {
+ read_unlock(&cpuset_lock);
+ return -EINVAL;
+ }
+
+
+ use_cpuset(cs);
+
+ read_unlock(&cpuset_lock);
+
+ if (bad_permission(cs)) {
+ release_cpuset(cs);
+ return -EPERM;
+ }
+
+ if (cpus_empty(cs->cpus_allowed)) { /* cannot attach a cpuset with no CPU */
+ release_cpuset(cs);
+ return -EINVAL;
+ }
+
+ if (pid) {
+ read_lock(&tasklist_lock);
+
+ task = find_task_by_pid(pid);
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ release_cpuset(cs);
+ return -ESRCH;
+ }
+
+ get_task_struct(task);
+ read_unlock(&tasklist_lock);
+
+ if ((current->euid) && (current->euid != task->uid) && (current->euid != task->suid)) {
+ put_task_struct(task);
+ release_cpuset(cs);
+ return -EPERM;
+ }
+ }
+ else {
+ task = current;
+ get_task_struct(task);
+ }
+
+ set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, cs->cpus_allowed));
+ cs->has_been_attached = 1;
+
+ /* release the current cpu set of the task */
+ /* lock to prevent a race where two cpuset_attach would be called on the same
+ * task at the same time, and task->cpuset would be released twice
+ */
+ spin_lock(&task->cpuset_attach_lock);
+ if (!task->cpuset) { /* task with no cpuset ? means it is exiting */
+ spin_unlock(&task->cpuset_attach_lock);
+ put_task_struct(task);
+ release_cpuset(cs);
+ return -ESRCH;
+ }
+ release_cpuset(task->cpuset);
+ /* now lock the cpuset, to protect any running migrate_cpuset...()
+ * from being disturbed by us
+ */
+ spin_lock(&cs->attach_lock);
+ task->cpuset = cs;
+ spin_unlock(&cs->attach_lock);
+
+ spin_unlock(&task->cpuset_attach_lock);
+
+
+ put_task_struct(task);
+
+ /* don't call release_cpuset here,
+ * the task being attached to the cpuset
+ * is really a new user !
+ */
+
+ return 0;
+}
+
+
+static int __cpuset_setaffinity(struct task_struct * task)
+{
+ cpumask_t allowed;
+ cpumask_t last = CPU_MASK_NONE; /* remember : the empty mask is not a valid mask */
+ int ret;
+
+ /* We cannot hold any lock while calling set_cpus_allowed
+ * since it might sleep
+ * Thus we try until we are sure we did it with the right mask
+ */
+ for(;;) {
+ spin_lock(&task->cpuset_attach_lock);
+ if (!task->cpuset) {
+ /* task exiting */
+ spin_unlock(&task->cpuset_attach_lock);
+ return 0;
+ }
+ allowed = task->cpuset->cpus_allowed;
+ spin_unlock(&task->cpuset_attach_lock);
+
+ if (cpus_equal(last, allowed))
+ return 0;
+
+ ret = set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, allowed));
+ if (ret < 0)
+ return ret;
+
+ last = allowed;
+ }
+}
+
+/* Our replacement function for set_cpus_allowed */
+int cpuset_setaffinity(struct task_struct * task, cpumask_t mask)
+{
+ task->cpus_wanted = mask;
+ return __cpuset_setaffinity(task);
+}
+
+/* When a cpuset with attached processes is being realloc'ed CPUs
+ * update the processes' masks and migrate them
+ */
+static void migrate_cpuset_processes(struct cpuset * cs)
+{
+ struct task_struct *g, *p;
+ /* This should be a RARE use of the cpusets.
+ * therefore we'll prefer an inefficient operation here
+ * (searching the whole process list)
+ * than adding another list_head in task_t
+ * and locks and list_add for each fork()
+ */
+
+ /* we need to lock tasklist_lock for reading the processes list
+ * BUT we cannot call set_cpus_allowed with any spinlock held
+ * => we need to store the list of task struct in an array
+ */
+ struct task_struct ** array;
+ int nb = 0;
+ int sz;
+
+ spin_lock(&cs->attach_lock);
+ /* at most cs->count - 1 processes to migrate */
+ sz = atomic_read(&cs->count) - 1;
+ array = (struct task_struct **) kmalloc(sz * sizeof(struct task_struct *), GFP_ATOMIC);
+ if (!array) {
+ spin_unlock(&cs->attach_lock);
+ printk("Error allocating array in migrate_cpuset_processes !\n");
+ return;
+ }
+ /* see linux/sched.h for this nested for/do-while loop */
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (p->cpuset == cs) {
+ if (nb == sz) {
+ printk("migrate_cpuset_processes: array full !\n");
+ goto end_loop; /* break won't work in this double loop */
+ }
+ get_task_struct(p);
+ array[nb++] = p;
+ }
+ } while_each_thread(g, p);
+end_loop:
+ read_unlock(&tasklist_lock);
+ spin_unlock(&cs->attach_lock);
+
+ while(nb) {
+ struct task_struct * p = array[--nb];
+ __cpuset_setaffinity(p);
+ put_task_struct(p);
+ }
+ kfree(array);
+}
+
+
+
+/* see if mask b is included in mask a */
+/* old version : #define MASK_INCLUDED(a, b) (((a)|(b)) == (a)) */
+static inline int MASK_INCLUDED(cpumask_t a, cpumask_t b)
+{
+ cpumask_t r;
+ cpus_or(r, a, b);
+ return cpus_equal(r, a);
+}
+
+static inline cpumask_t CPUS_NOT(cpumask_t a)
+{
+ cpus_complement(a);
+ return a;
+}
+
+static inline cpumask_t CPUS_OR(cpumask_t a, cpumask_t b)
+{
+ cpumask_t r;
+ cpus_or(r, a, b);
+ return r;
+}
+
+static inline cpumask_t CPUS_AND(cpumask_t a, cpumask_t b)
+{
+ cpumask_t r;
+ cpus_and(r, a, b);
+ return r;
+}
+
+
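+/* give a cpuset its CPUs; the new mask must stay within the parent
+ * cpuset and, for a strict cpuset, may only take unreserved CPUs
+ */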
+asmlinkage long sys_cpuset_alloc(cpuset_t cpuset, int len, unsigned long * user_mask_ptr)
+{
+ cpumask_t new_mask;
+ cpumask_t old_mask;
+ struct cpuset * cs ;
+ int retval;
+
+ info("sys_cpuset_alloc(%d, ...) called\n", cpuset);
+
+ if (cpuset == CPUSET_TOP_ID)
+ return -EINVAL;
+
+ if (len < sizeof(new_mask))
+ return -EINVAL;
+
+ if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
+ return -EFAULT;
+
+ /* do some sanity checks on the mask */
+ /* must have at least ONE cpu */
+ if (cpus_empty(new_mask))
+ return -EINVAL;
+
+ /* XXX phys_cpu_present_map has changed type --
+ * I disable this test for now
+ * anyway it is not _NEEDED_ since new_mask will have to stay
+ * in the parent's mask
+ * (just some overhead in a _really_ rare case) */
+#if 0
+ /* must only have existing CPUs */
+ if (!MASK_INCLUDED(phys_cpu_present_map, new_mask))
+ return -EINVAL;
+#endif
+
+ info(" with mask %016lx\n", new_mask);
+
+ read_lock(&cpuset_lock);
+ cs = find_cpuset_by_id(cpuset);
+
+
+ if (!cs) {
+ read_unlock(&cpuset_lock);
+ return -EINVAL;
+ }
+
+ use_cpuset(cs);
+ read_unlock(&cpuset_lock);
+
+ if (bad_permission(cs)) {
+ release_cpuset(cs);
+ return -EPERM;
+ }
+
+ /* lock early - we do not want the parent's masks to change under us */
+ write_lock(&cpuset_lock);
+ /* must only have CPUs in the parent cpuset (if any) */
+ retval = -EACCES;
+ if (!MASK_INCLUDED(cs->parent->cpus_allowed, new_mask))
+ goto mask_error;
+
+ old_mask = cs->cpus_allowed;
+
+ retval = -EBUSY;
+ /* must only have free cpus */
+ if (cs->flags & CPUSET_STRICT) {
+ /* CPUs already in this cs ARE free for us ! -> old_mask */
+ /* The next few lines mean :
+ * if (!MASK_INCLUDED(~cs->parent->cpus_reserved, new_mask & (~old_mask)))
+ * (just obfuscated by the cpus_ macros)
+ */
+ if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_reserved),
+ CPUS_AND(new_mask, CPUS_NOT(old_mask))))
+ goto mask_error;
+ }
+ else {
+ if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_strictly_reserved), new_mask))
+ goto mask_error;
+ }
+
+
+ /* are we trying to FREE reserved CPUs
+ * (i.e. reserved by children cpusets)
+ * from a non-unused cpuset ? */
+ /* if (cs->cpus_reserved & ~new_mask) */
+ if (!cpus_empty(CPUS_AND(cs->cpus_reserved, CPUS_NOT(new_mask))))
+ goto mask_error;
+
+ /* everything is OK */
+ cs->cpus_allowed = new_mask;
+ rebuild_reserved_masks(cs->parent);
+ write_unlock(&cpuset_lock);
+
+ /* did we change a non-unused cpuset ? */
+ if ((atomic_read(&cs->count) > 1) && !cpus_equal(new_mask, old_mask)) {
+ migrate_cpuset_processes(cs);
+ }
+
+ release_cpuset(cs);
+ return 0;
+
+mask_error:
+ write_unlock(&cpuset_lock);
+ release_cpuset(cs);
+ return retval;
+}
+
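+/* return the CPUs of the current cpuset that a new child could
+ * still use: with CPUSET_STRICT all reserved CPUs are excluded,
+ * otherwise only the strictly reserved ones
+ */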
+asmlinkage long sys_cpuset_getfreecpus(int flags, int len, unsigned long * user_mask_ptr)
+{
+ cpumask_t reserved;
+ cpumask_t free;
+
+ int real_len = sizeof(unsigned long);
+ if (len < real_len)
+ return -EINVAL;
+
+ if (flags & CPUSET_STRICT)
+ reserved = current->cpuset->cpus_reserved;
+ else
+ reserved = current->cpuset->cpus_strictly_reserved;
+
+ free = CPUS_AND(current->cpuset->cpus_allowed, CPUS_NOT(reserved));
+
+ if (copy_to_user(user_mask_ptr, &free, real_len))
+ return -EFAULT;
+
+ return real_len;
+}
+
+/*************************************************************
+ ***************** /proc/cpusets stuff ***********************
+ *************************************************************
+ */
+#ifdef CONFIG_CPUSETS_PROC
+
+static void *proc_cpusets_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+ struct list_head *p;
+
+ read_lock(&cpuset_lock);
+ if (!n) seq_puts(m, "cpusets info\n");
+
+ p = &top_cpuset.list;
+ while (n--) {
+ p = p->next;
+ if (p == &top_cpuset.list)
+ return NULL;
+ }
+ return list_entry(p, struct cpuset, list);
+}
+
+static void *proc_cpusets_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ struct cpuset * cs = p;
+ ++*pos;
+ return cs->list.next == &top_cpuset.list ? NULL
+ : list_entry(cs->list.next, struct cpuset, list);
+}
+
+/* How many chars needed to print a long (as a mask) ? */
+#define CHARS_FOR_LONG (BITS_PER_LONG / 4)
+#define CFL CHARS_FOR_LONG
+static void sprint_mask(char * buf, cpumask_t mask)
+{
+#ifdef CPU_ARRAY_SIZE
+ int l;
+ for (l = CPU_ARRAY_SIZE - 1; l>=0; l--) {
+ /* XXX only 64 bits long supported here ! */
+ sprintf(buf, "%016lx", mask.mask[l]);
+ buf += CFL;
+ }
+#else
+ /* XXX only 64 bits long supported here ! */
+ sprintf(buf, "%016lx", mask);
+#endif
+}
+
+
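+/* each cpuset is shown as, e.g. (values illustrative):
+ *   cpuset 2 {
+ *           parent = 1
+ *           flags = 1
+ *           count = 2
+ *           hba = 1
+ *           uid & suid = 500 & 500
+ *           cpus_allowed = 0000000000000003
+ *           cpus_reserved = 0000000000000000
+ *           cpus_strictly_reserved = 0000000000000000
+ *   }
+ */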
+static int proc_cpusets_show(struct seq_file *m, void *p)
+{
+ struct cpuset * cs = p;
+#ifdef CPU_ARRAY_SIZE
+ char maskbuf[CPU_ARRAY_SIZE * CFL + 1];
+#else
+ char maskbuf[CFL + 1];
+#endif
+
+ seq_printf(m, "cpuset %d {\n"
+ "\tparent = %d\n"
+ "\tflags = %d\n"
+ "\tcount = %d\n"
+ "\thba = %d\n"
+ "\tuid & suid = %d & %d\n",
+ cs->id, cs->parent ? cs->parent->id : -1,
+ cs->flags, atomic_read(&cs->count), cs->has_been_attached,
+ cs->uid, cs->suid);
+
+ sprint_mask(maskbuf, cs->cpus_allowed);
+ seq_printf(m,"\tcpus_allowed = %s\n", maskbuf);
+ sprint_mask(maskbuf, cs->cpus_reserved);
+ seq_printf(m,"\tcpus_reserved = %s\n", maskbuf);
+ sprint_mask(maskbuf, cs->cpus_strictly_reserved);
+ seq_printf(m,"\tcpus_strictly_reserved = %s\n", maskbuf);
+
+ seq_printf(m, "}\n\n");
+
+ return 0;
+}
+
+static void proc_cpusets_stop(struct seq_file *m, void *p)
+{
+ read_unlock(&cpuset_lock);
+}
+
+static struct seq_operations cpusets_op = {
+ .start = proc_cpusets_start,
+ .next = proc_cpusets_next,
+ .stop = proc_cpusets_stop,
+ .show = proc_cpusets_show
+};
+
+
+static int proc_cpusets_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &cpusets_op);
+}
+
+static struct file_operations proc_cpusets_operations = {
+ .open = proc_cpusets_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+static int __init proc_cpusets_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("cpusets", 0, NULL);
+ if (entry)
+ entry->proc_fops = &proc_cpusets_operations;
+ return 0;
+}
+
+/*************************************************************
+ *********** /proc/xxx/cpuset ********************************
+ *************************************************************
+ */
+int proc_pid_cpuset(struct task_struct *task, char *buffer)
+{
+ return sprintf(buffer, "%d\n", task->cpuset->id);
+}
+
+#endif /* CONFIG_CPUSETS_PROC */
+
diff -Nru a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c Tue Oct 21 16:05:27 2003
+++ b/kernel/exit.c Tue Oct 21 16:05:27 2003
@@ -54,6 +54,19 @@

BUG_ON(p->state < TASK_ZOMBIE);

+
+#ifdef CONFIG_CPUSETS
+ spin_lock(&p->cpuset_attach_lock);
+ release_cpuset(p->cpuset);
+
+ /* mark that this process's cpuset has already been released
+ * another process might still try to cpuset_attach this process
+ */
+ p->cpuset = NULL;
+ spin_unlock(&p->cpuset_attach_lock);
+#endif /* CONFIG_CPUSETS */
+
+
atomic_dec(&p->user->processes);
spin_lock(&p->proc_lock);
proc_dentry = proc_pid_unhash(p);
@@ -87,6 +100,7 @@
spin_unlock(&p->proc_lock);
proc_pid_flush(proc_dentry);
release_thread(p);
+
put_task_struct(p);
}

diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c Tue Oct 21 16:05:27 2003
+++ b/kernel/fork.c Tue Oct 21 16:05:27 2003
@@ -31,6 +31,10 @@
#include <linux/ptrace.h>
#include <linux/mount.h>

+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -1035,6 +1039,11 @@
SET_LINKS(p);
if (p->ptrace & PT_PTRACED)
__ptrace_link(p, current->parent);
+
+#ifdef CONFIG_CPUSETS
+ use_cpuset(p->cpuset);
+#endif
+

attach_pid(p, PIDTYPE_PID, p->pid);
if (thread_group_leader(p)) {
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Tue Oct 21 16:05:27 2003
+++ b/kernel/sched.c Tue Oct 21 16:05:27 2003
@@ -38,6 +38,10 @@
#include <linux/cpu.h>
#include <linux/percpu.h>

+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
#ifdef CONFIG_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
#else
@@ -2203,7 +2207,11 @@
!capable(CAP_SYS_NICE))
goto out_unlock;

+#ifdef CONFIG_CPUSETS
+ retval = cpuset_setaffinity(p, new_mask);
+#else
retval = set_cpus_allowed(p, new_mask);
+#endif

out_unlock:
put_task_struct(p);
@@ -2236,7 +2244,11 @@
goto out_unlock;

retval = 0;
+#ifdef CONFIG_CPUSETS
+ mask = p->cpus_wanted;
+#else
cpus_and(mask, p->cpus_allowed, cpu_online_map);
+#endif

out_unlock:
read_unlock(&tasklist_lock);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/