Re: ~5x greater CPU load for a networked application when using 2.6.15-rt15-smp vs. 2.6.12-1.1390_FC4

From: Ingo Molnar
Date: Thu Feb 23 2006 - 16:04:45 EST



* Gautam H Thaker <gthaker@xxxxxxxxxxxx> wrote:

> ::::::::::::::
> top: 2.6.15-rt15-smp.out # REAL_TIME KERNEL
> ::::::::::::::

> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> 2906 root -66 0 18624 2244 1480 S 41.4 0.1 27:11.21 nalive.p
> 6 root -91 0 0 0 0 S 32.3 0.0 21:04.53 softirq-net-rx/
> 1379 root -40 -5 0 0 0 S 14.5 0.0 9:54.76 IRQ 23

One effect of the -rt kernel is that it shows IRQ load explicitly, as
CPU time of the IRQ and softirq threads, while the stock kernel can
'hide' that load because interrupts there run 'atomically', outside of
any task context, which makes the true system overhead hard to measure.
So the -rt kernel will likely show more overhead, but i'd not expect
this much of it.

To figure out the true overhead of both kernels, could you try the
attached loop_print_thread.c code and run it on an idle non-rt kernel,
an idle -rt kernel, a busy non-rt kernel and a busy -rt kernel, and
send me the typical/average loops/sec values you are getting?
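
(The attached code assumes a 32-bit x86 CPU for the rdtsc inline
assembly, and it hard-codes a 525 MHz clock in the 'mhz' variable, so
that value may need adjusting. Presumably it would be built along the
lines of 'gcc -O2 -o loop_print2 loop_print_thread.c -lpthread' and
started as './loop_print2 1' for the single-threaded case.)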

Furthermore, there have been some tasklet-related fixes in 2.6.15-rt17,
which could perhaps improve this workload.

Also, would there be some easy way for me to reproduce that workload?
Perhaps some .c code you could send that is easy to run on the server
and the client, and that reproduces the guts of the workload?
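
For illustration, a skeleton of such client/server code could look like
the sketch below - assuming a plain UDP request/reply pattern; the port
number and payload size are arbitrary, not taken from the real
application:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>

#define PORT 5555

int main(int argc, char **argv)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(PORT) };
	char buf[64];
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0 || argc < 2)
		exit(1);
	if (!strcmp(argv[1], "server")) {
		struct sockaddr_in peer;
		socklen_t len = sizeof(peer);

		addr.sin_addr.s_addr = htonl(INADDR_ANY);
		if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
			exit(1);
		for (;;) {	/* echo every datagram back to its sender */
			int n = recvfrom(s, buf, sizeof(buf), 0,
					 (struct sockaddr *)&peer, &len);
			if (n > 0)
				sendto(s, buf, n, 0,
				       (struct sockaddr *)&peer, len);
		}
	} else {
		if (argc < 3 || !inet_aton(argv[2], &addr.sin_addr))
			exit(1);
		for (;;) {	/* fire request/reply round-trips flat out */
			sendto(s, buf, sizeof(buf), 0,
			       (struct sockaddr *)&addr, sizeof(addr));
			recv(s, buf, sizeof(buf), 0);
		}
	}
	return 0;
}

Run it as './netload server' on one box and './netload client
<server-ip>' on the other.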

Ingo

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <unistd.h>

/* Read the CPU timestamp counter (32-bit x86: result lands in edx:eax). */
#define rdtscll(val) \
	__asm__ __volatile__ ("rdtsc" : "=A" (val))

/* Length of one measurement interval, in seconds. */
#define SECS 3ULL

volatile unsigned int count_array[1000] __attribute__((aligned(256)));
int atomic = 0;			/* use "lock; incl" instead of a plain increment */
unsigned long long delta = 0;	/* calibrated cycles per loop iteration */

void *loop(void *arg)
{
	/* 'mhz' is actually the CPU clock in Hz, hard-coded for a ~525 MHz box. */
	unsigned long long start, now, mhz = 525000000, limit = mhz * SECS,
		min = -1ULL, tmp;
	volatile unsigned int *count, offset = (unsigned long)arg;
	int j;

	printf("offset: %u (atomic: %d).\n", offset, atomic);
	count = (void *)count_array + offset;

	if (!arg) {
		/*
		 * Thread 0 calibrates: run ten short bursts and remember
		 * the minimum cycles-per-iteration cost, which is later
		 * subtracted from the measured loop cost.
		 */
		for (j = 0; j < 10; j++) {
			limit = mhz/10;
			*count = 0;
			rdtscll(start);
			for (;;) {
				(*count)++;
				rdtscll(now);
				if (now - start > limit)
					break;
			}
			rdtscll(now);
			tmp = (now-start)/(*count);
			if (tmp < min)
				min = tmp;
		}
		printf("delta: %llu\n", min);
		delta = min;
	} else
		/* the other threads wait until calibration has finished */
		while (!delta)
			usleep(100000);
	limit = mhz*SECS;

repeat:
	/* Increment the counter flat out for SECS seconds, then report. */
	*count = 0;
	rdtscll(start);
	if (atomic)
		for (;;) {
			asm ("lock; incl %0" : "=m" (*count) : "m" (*count));
			rdtscll(now);
			if (now - start > limit)
				break;
		}
	else
		for (;;) {
			(*count)++;
			rdtscll(now);
			if (now - start > limit)
				break;
		}
	printf("speed: %llu loops (%llu cycles per iteration).\n",
		(*count)/SECS, (limit/(*count)-delta));
	fflush(stdout);
	goto repeat;
}

int main (int argc, char **argv)
{
	unsigned int nr_threads, i, ret, offset = 0;
	pthread_t *t;

	if (argc != 2 && argc != 3 && argc != 4) {
usage:
		printf("usage: loop_print2 <nr threads> [<counter offset>] [<atomic>]\n");
		exit(-1);
	}
	nr_threads = atol(argv[1]);
	if (!nr_threads)
		goto usage;
	t = calloc(nr_threads, sizeof(*t));
	if (!t)
		exit(-1);
	if (argc >= 3)
		offset = atol(argv[2]);
	/* keep each thread's counter in a distinct word at minimum */
	if (offset < sizeof(unsigned int))
		offset = sizeof(unsigned int);
	atomic = 0;
	if (argc >= 4) {
		atomic = atol(argv[3]);
		printf("a: %d\n", atomic);
	}

	for (i = 1; i < nr_threads; i++) {
		ret = pthread_create(t+i, NULL, loop,
				     (void *)(unsigned long)(i*offset));
		if (ret)
			exit(-1);
	}
	/* the main thread runs the calibrating instance; loop() never returns */
	loop((void *)0);

	return 0;
}
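
(With SECS set to 3, the program prints a speed line every three
seconds; under load, the loops/sec figure drops roughly in proportion
to the CPU time the kernel and interrupt handling steal from the loop,
which is what makes the idle-vs-busy comparison between the two kernels
meaningful.)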