Re: [patch] kstat change to see how much Linux SMP really scale well

Andi Kleen (ak@muc.de)
13 Mar 1999 08:42:35 +0100


andrea@e-mind.com (Andrea Arcangeli) writes:

> If I run something that doesn't scale well in SMP (due the big kernel
> lock) right now the kernel doesn't tell me how much the two CPU are really
> doing _useful_ things.
>
> While one CPU is blocked spinning on the lock it's not useful but we are
> accounting such time as normal kernel "useful" time. I don't like this too
> much because I want to see in xosview also how much the two CPU are used
> really. I think this would allow us to trivially discover big-kernel-lock
> bottleneck for example.
>
> So I changed the SMP kernel-time accounting to take care of what the
> kernel is doing. If the kernel was spinning on the lock at profiling time
> the system kernel-time won't be increased.

Nice idea. I extended it to account for the lock time separately (per process
and global). With a modified top this gives a nice overview of locking
problems. The patch below is relative to your patch.

I changed the lock accounting back to system time, because it really is system
time, but it is counted in the lock statistics too. To get the pure (non-locked)
system overhead, simply subtract the lock counter from the system time.

It probably only compiles on i386 because I didn't change the non-i386
update_one_process callers. To catch such mistakes I moved the prototype
into sched.h, though.

BTW, on i386 the SMP timer interrupt does not handle lost ticks. The
slightly skewed statistics are probably not a big problem, but the lost ticks
disturb scheduling too, which may have negative effects.

Now the dream would be detailed per-spinlock counters. Unfortunately,
the current spinlock declaration syntax makes it hard to name them (and
dynamic spinlocks embedded in other structures would need a different
approach than static naming anyway - for them, reading System.map doesn't
work well either). This means that named spinlock profiling would require
changing all spinlock users :(:(. I believe such counters are definitely
needed for effective SMP tuning in 2.3, though.

-Andi

--- linux/fs/proc/array.c.~8~ Fri Jan 29 17:15:06 1999
+++ linux/fs/proc/array.c Sat Mar 13 08:16:21 1999
@@ -245,14 +245,15 @@
kstat.cpu_system,
jiffies*smp_num_cpus - (kstat.cpu_user + kstat.cpu_nice + kstat.cpu_system));
for (i = 0 ; i < smp_num_cpus; i++)
- len += sprintf(buffer + len, "cpu%d %u %u %u %lu\n",
+ len += sprintf(buffer + len, "cpu%d %u %u %u %lu %u\n",
i,
kstat.per_cpu_user[cpu_logical_map(i)],
kstat.per_cpu_nice[cpu_logical_map(i)],
kstat.per_cpu_system[cpu_logical_map(i)],
jiffies - ( kstat.per_cpu_user[cpu_logical_map(i)] \
+ kstat.per_cpu_nice[cpu_logical_map(i)] \
- + kstat.per_cpu_system[cpu_logical_map(i)]));
+ + kstat.per_cpu_system[cpu_logical_map(i)]),
+ kstat.per_cpu_lock[cpu_logical_map(i)]);
len += sprintf(buffer + len,
"disk %u %u %u %u\n"
"disk_rio %u %u %u %u\n"
@@ -817,7 +818,16 @@
cap_t(p->cap_effective));
}

-
+static inline char *task_ltime(struct task_struct *p, char *buffer)
+{
+ int c;
+ buffer += sprintf(buffer, "LockTime:");
+ for (c = 0; c < smp_num_cpus; c++)
+ buffer += sprintf(buffer, " %lu", p->per_cpu_ltime[c]);
+ *buffer++ = '\n';
+ return buffer;
+}
+
static int get_status(int pid, char * buffer)
{
char * orig = buffer;
@@ -833,6 +843,7 @@
buffer = task_mem(tsk, buffer);
buffer = task_sig(tsk, buffer);
buffer = task_cap(tsk, buffer);
+ buffer = task_ltime(tsk, buffer);
return buffer - orig;
}

--- linux/kernel/sched.c.~1~ Thu Jan 21 17:27:28 1999
+++ linux/kernel/sched.c Sat Mar 13 07:45:55 1999
@@ -1331,10 +1331,11 @@
}

void update_one_process(struct task_struct *p,
- unsigned long ticks, unsigned long user, unsigned long system, int cpu)
+ unsigned long ticks, unsigned long user, unsigned long system, int cpu, unsigned long lock)
{
p->per_cpu_utime[cpu] += user;
p->per_cpu_stime[cpu] += system;
+ p->per_cpu_ltime[cpu] += lock;
do_process_times(p, user, system);
do_it_virt(p, user);
do_it_prof(p, ticks);
--- linux/include/linux/sched.h.~1~ Sat Feb 27 19:51:49 1999
+++ linux/include/linux/sched.h Sat Mar 13 07:47:43 1999
@@ -272,7 +272,7 @@
struct timer_list real_timer;
struct tms times;
unsigned long start_time;
- long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+ long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS], per_cpu_ltime[NR_CPUS];
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
int swappable:1;
@@ -359,7 +359,7 @@
/* timeout */ SCHED_OTHER,0,0,0,0,0,0,0, \
/* timer */ { NULL, NULL, 0, 0, it_real_fn }, \
/* utime */ {0,0,0,0},0, \
-/* per CPU times */ {0, }, {0, }, \
+/* per CPU times */ {0, }, {0, }, {0, }, \
/* flt */ 0,0,0,0,0,0, \
/* swp */ 0,0,0, \
/* process credentials */ \
@@ -456,6 +456,9 @@
extern unsigned long itimer_next;
extern struct timeval xtime;
extern void do_timer(struct pt_regs *);
+extern void update_one_process( struct task_struct *p,
+ unsigned long ticks, unsigned long user,
+ unsigned long system, int cpu, unsigned long lock);

extern unsigned int * prof_buffer;
extern unsigned long prof_len;
--- linux/include/linux/kernel_stat.h.~1~ Sat Feb 27 19:52:14 1999
+++ linux/include/linux/kernel_stat.h Sat Mar 13 08:12:53 1999
@@ -17,7 +17,8 @@
unsigned int cpu_user, cpu_nice, cpu_system;
unsigned int per_cpu_user[NR_CPUS],
per_cpu_nice[NR_CPUS],
- per_cpu_system[NR_CPUS];
+ per_cpu_system[NR_CPUS],
+ per_cpu_lock[NR_CPUS];
unsigned int dk_drive[DK_NDRIVE];
unsigned int dk_drive_rio[DK_NDRIVE];
unsigned int dk_drive_wio[DK_NDRIVE];
--- linux/arch/i386/kernel/smp.c.~1~ Sat Mar 13 07:28:25 1999
+++ linux/arch/i386/kernel/smp.c Sat Mar 13 08:12:00 1999
@@ -46,7 +46,7 @@
extern unsigned long start_kernel, _etext;
extern void update_one_process( struct task_struct *p,
unsigned long ticks, unsigned long user,
- unsigned long system, int cpu);
+ unsigned long system, int cpu, unsigned long lock);
/*
* Some notes on processor bugs:
*
@@ -1697,7 +1697,7 @@
x86_do_profile(regs->eip);

if (!--prof_counter[cpu]) {
- int user=0,system=0;
+ int user=0,system=0,lock=0;
struct task_struct * p = current;

/*
@@ -1709,17 +1709,19 @@

if (user_mode(regs))
user=1;
- else
- /*
- * Avoid to account as useful the wasted time the CPU
- * spend spinning on the SMP locks. -arca
- */
- if (!smp_cpu_spinning(regs))
- system=1;
+ else {
+ if (smp_cpu_spinning(regs))
+ lock=1;
+ system=1;
+ }

irq_enter(cpu, 0);
if (p->pid) {
- update_one_process(p, 1, user, system, cpu);
+ /* Note: this does not account for lost ticks
+ * because of irq locking, which may explain
+ * bad scheduler behaviour on SMP machines -AK.
+ */
+ update_one_process(p, 1, user, system, cpu, lock);

p->counter -= 1;
if (p->counter < 0) {
@@ -1736,6 +1738,7 @@

kstat.cpu_system += system;
kstat.per_cpu_system[cpu] += system;
+ kstat.per_cpu_lock[cpu] += lock;

}
prof_counter[cpu]=prof_multiplier[cpu];

-- 
This is like TV. I don't like TV.

- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.rutgers.edu Please read the FAQ at http://www.tux.org/lkml/