[PATCH RFC] select_idle_sibling experiments

From: Chris Mason
Date: Tue Apr 05 2016 - 14:10:54 EST


Hi everyone,

We're porting the fb kernel up to 4.5, and one of our last few out-of-tree
patches is a hack to try harder to find idle cpus when waking up tasks.
This helps in pretty much every workload we run, mostly because they all
get tuned with a similar setup:

1) Find the load where latencies stop being acceptable
2) Run the server at just a little less than that

Usually this means our CPUs are just a little bit idle, and a poor
scheduler decision to place a task on a busy CPU instead of an idle CPU
ends up impacting our p99 latencies.

Mike helped us with this last year, fixing up wake_wide() to improve
things. But we still ended up having to go back to the old hack.

I started with a small-ish program to benchmark wakeup latencies. The
basic idea is a bunch of worker threads who sit around and burn CPU.
Every once in a while they send a message to a message thread.

The message thread records the time he woke up the worker, and the
worker records the delta between that time and the time he actually got
the CPU again. At the end it spits out a latency histogram. The only
thing we record is the wakeup latency; there's no measurement of 'work
done' or any of the normal things you'd expect in a benchmark.
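The core of it is just two gettimeofday() calls on either side of a
wakeup. Here's the idea in miniature (a hand-condensed sketch rather
than an excerpt of schbench.c below; it uses a mutex/condvar pair where
the real thing uses futexes and a lock-free list):

#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static struct timeval wake_time;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int posted;

static void *worker(void *arg)
{
        struct timeval now;

        pthread_mutex_lock(&lock);
        while (!posted)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        /* delta between the messenger's stamp and when we actually ran */
        gettimeofday(&now, NULL);
        printf("wakeup latency: %ld usec\n",
               (now.tv_sec - wake_time.tv_sec) * 1000000L +
               (now.tv_usec - wake_time.tv_usec));
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        usleep(10000);

        /* messenger side: stamp the wake time, then wake the worker */
        pthread_mutex_lock(&lock);
        gettimeofday(&wake_time, NULL);
        posted = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(tid, NULL);
        return 0;
}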

It has knobs for cpu think time, and for how long the messenger thread
waits before replying. Here's how I'm running it with my patch:

./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 62
90.0000th: 73
95.0000th: 79
*99.0000th: 99
99.5000th: 761
99.9000th: 10160
Over=0, min=0, max=14659

This translates to a cputime of 30ms, a sleep time of 30ms, 6 messenger
threads, 24 workers per messenger (144 workers in all) and a run time of
30 seconds. My box has two sockets, 24 cores each. Mainline varies a
bit, but numbers like this are typical:

./schbench -c 30000 -s 30000 -m 6 -t 24 -r 30
Latency percentiles (usec)
50.0000th: 50
75.0000th: 63
90.0000th: 76
95.0000th: 85
*99.0000th: 4680
99.5000th: 10192
99.9000th: 10928
Over=0, min=0, max=21816

A high p99 in real application performance will block a new kernel for
us. p99.5 and p99.9 are included just to show how long the tail really
is.

I've inlined schbench.c below and attached it as a .gz file just in case
Exchange manages to munge it.

Now, on to the patch. I pushed some code around and narrowed the
problem down to select_idle_sibling(). We have cores going into and out
of idle fast enough that even this cut our latencies in half:

static int select_idle_sibling(struct task_struct *p, int target)
                                goto next;

                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (i == target || !idle_cpu(i))
+                                if (!idle_cpu(i))
                                        goto next;
                        }

IOW, by the time we get down to for_each_cpu(), the idle_cpu() check
done at the top of the function is no longer valid: the target CPU that
looked busy at entry may well have gone idle again, so skipping it
during the group scan throws away a perfectly good idle core.

I tried a few variations on select_idle_sibling() that preserved the
underlying goal of returning idle cores before idle SMT threads. They
were all horrible in different ways, and none of them were fast.

The patch below just makes select_idle_sibling pick the first idle
thread it can find. When I ran it through production workloads here, it
was faster than the patch we've been carrying around for the last few
years.


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b7d4b..c41baa6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4974,7 +4974,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 static int select_idle_sibling(struct task_struct *p, int target)
 {
         struct sched_domain *sd;
-        struct sched_group *sg;
         int i = task_cpu(p);
 
         if (idle_cpu(target))
@@ -4990,24 +4989,14 @@ static int select_idle_sibling(struct task_struct *p, int target)
          * Otherwise, iterate the domains and find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
-        for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (i == target || !idle_cpu(i))
-                                        goto next;
-                        }
+        if (!sd)
+                goto done;
 
-                        target = cpumask_first_and(sched_group_cpus(sg),
-                                        tsk_cpus_allowed(p));
+        for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                if (cpu_active(i) && idle_cpu(i)) {
+                        target = i;
                         goto done;
-next:
-                        sg = sg->next;
-                } while (sg != sd->groups);
+                }
         }
 done:
         return target;

--------------------------------------------

/*
 * schbench.c
 *
 * Copyright (C) 2016 Facebook
 * Chris Mason <clm@xxxxxx>
 *
 * GPLv2, portions copied from the kernel and from Jens Axboe's fio
 *
 * gcc -Wall -O0 -W schbench.c -o schbench -lpthread
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <getopt.h>
#include <sys/time.h>
#include <time.h>
#include <string.h>
#include <linux/futex.h>
#include <sys/syscall.h>

#define PLAT_BITS 8
#define PLAT_VAL (1 << PLAT_BITS)
#define PLAT_GROUP_NR 19
#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
#define PLAT_LIST_MAX 20

/* -m number of message threads */
static int message_threads = 2;
/* -t number of workers per message thread */
static int worker_threads = 16;
/* -r seconds */
static int runtime = 30;
/* -s usec */
static int sleeptime = 30000;
/* -c usec */
static unsigned long long cputime = 30000;
/* -a, bool */
static int autobench = 0;

/* the latency histogram uses this to pitch outliers */
static unsigned int max_us = 50000;

/* main() sets this to the time when we should all stop doing work */
static struct timeval global_stop;

/* the message threads flip this to true when they decide runtime is up */
static unsigned long stopping = 0;


/*
 * one stat struct per thread data, when the workers sleep this records the
 * latency between when they are woken up and when they actually get the
 * CPU again.  The message threads sum up the stats of all the workers and
 * then bubble them up to main() for printing
 */
struct stats {
        unsigned int plat[PLAT_NR];
        unsigned int nr_samples;
        unsigned int max;
        unsigned int min;
        unsigned int over;
};

/* this defines which latency profiles get printed */
#define PLIST_P99 4
static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9 };
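/* PLIST_P99 above is the index of the 99.0 entry in plist */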

enum {
        HELP_LONG_OPT = 1,
};

char *option_string = "am:t:s:c:r:";
static struct option long_options[] = {
        {"auto", no_argument, 0, 'a'},
        {"message-threads", required_argument, 0, 'm'},
        {"threads", required_argument, 0, 't'},
        {"runtime", required_argument, 0, 'r'},
        {"sleeptime", required_argument, 0, 's'},
        {"cputime", required_argument, 0, 'c'},
        {"help", no_argument, 0, HELP_LONG_OPT},
        {0, 0, 0, 0}
};

static void print_usage(void)
{
        fprintf(stderr, "schbench usage:\n"
                "\t-a (--auto): grow the worker count until p99 latencies get bad\n"
                "\t-m (--message-threads): number of message threads (def: 2)\n"
                "\t-t (--threads): worker threads per message thread (def: 16)\n"
                "\t-r (--runtime): How long to run before exiting (seconds, def: 30)\n"
                "\t-s (--sleeptime): Message thread latency (usec, def: 30000)\n"
                "\t-c (--cputime): How long to think during loop (usec, def: 30000)\n"
               );
        exit(1);
}

static void parse_options(int ac, char **av)
{
        int c;

        while (1) {
                int option_index = 0;

                c = getopt_long(ac, av, option_string,
                                long_options, &option_index);

                if (c == -1)
                        break;

                switch (c) {
                case 'a':
                        autobench = 1;
                        break;
                case 's':
                        sleeptime = atoi(optarg);
                        break;
                case 'c':
                        cputime = atoi(optarg);
                        break;
                case 'm':
                        message_threads = atoi(optarg);
                        break;
                case 't':
                        worker_threads = atoi(optarg);
                        break;
                case 'r':
                        runtime = atoi(optarg);
                        break;
                case '?':
                case HELP_LONG_OPT:
                        print_usage();
                        break;
                default:
                        break;
                }
        }

        if (optind < ac) {
                fprintf(stderr, "Error: extra arguments '%s'\n", av[optind]);
                exit(1);
        }
}

void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0)
{
        tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
        tdiff->tv_usec = t1->tv_usec - t0->tv_usec;
        if (tdiff->tv_usec < 0 && tdiff->tv_sec > 0) {
                tdiff->tv_sec--;
                tdiff->tv_usec += 1000000;
                if (tdiff->tv_usec < 0) {
                        fprintf(stderr, "lat_fs: tvsub shows test time ran backwards!\n");
                        exit(1);
                }
        }

        /* time shouldn't go backwards!!! */
        if (tdiff->tv_usec < 0 || t1->tv_sec < t0->tv_sec) {
                tdiff->tv_sec = 0;
                tdiff->tv_usec = 0;
        }
}

/*
 * returns the difference between start and stop in usecs.  Negative values
 * are turned into 0
 */
unsigned long long tvdelta(struct timeval *start, struct timeval *stop)
{
        struct timeval td;
        unsigned long long usecs;

        tvsub(&td, stop, start);
        usecs = td.tv_sec;
        usecs *= 1000000;
        usecs += td.tv_usec;
        return (usecs);
}

/* mr axboe's magic latency histogram */
static unsigned int plat_val_to_idx(unsigned int val)
{
        unsigned int msb, error_bits, base, offset;

        /* Find MSB starting from bit 0 */
        if (val == 0)
                msb = 0;
        else
                msb = sizeof(val) * 8 - __builtin_clz(val) - 1;

        /*
         * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
         * all bits of the sample as index
         */
        if (msb <= PLAT_BITS)
                return val;

        /* Compute the number of error bits to discard */
        error_bits = msb - PLAT_BITS;

        /* Compute the number of buckets before the group */
        base = (error_bits + 1) << PLAT_BITS;

        /*
         * Discard the error bits and apply the mask to find the
         * index for the buckets in the group
         */
        offset = (PLAT_VAL - 1) & (val >> error_bits);

        /* Make sure the index does not exceed (array size - 1) */
        return (base + offset) < (PLAT_NR - 1) ?
                (base + offset) : (PLAT_NR - 1);
}

/*
 * Convert the given index of the bucket array to the value
 * represented by the bucket
 */
static unsigned int plat_idx_to_val(unsigned int idx)
{
        unsigned int error_bits, k, base;

        if (idx >= PLAT_NR) {
                fprintf(stderr, "idx %u is too large\n", idx);
                exit(1);
        }

        /*
         * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
         * all bits of the sample as index
         */
        if (idx < (PLAT_VAL << 1))
                return idx;

        /* Find the group and compute the minimum value of that group */
        error_bits = (idx >> PLAT_BITS) - 1;
        base = 1 << (error_bits + PLAT_BITS);

        /* Find its bucket number of the group */
        k = idx % PLAT_VAL;

        /* Return the mean of the range of the bucket */
        return base + ((k + 0.5) * (1 << error_bits));
}
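/*
 * worked example of the bucketing with PLAT_BITS == 8: a sample of
 * 1000 usec has its msb at bit 9, so error_bits = 1, base = 512 and
 * offset = (1000 >> 1) & 255 = 244, giving bucket index 756.  Going
 * the other way, plat_idx_to_val(756) = 512 + 244.5 * 2 = 1001, the
 * midpoint of the 2-usec-wide bucket [1000, 1002).
 */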


static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr,
                                     unsigned int **output)
{
        unsigned long sum = 0;
        unsigned int len, i, j = 0;
        unsigned int oval_len = 0;
        unsigned int *ovals = NULL;
        int is_last;

        len = 0;
        while (len < PLAT_LIST_MAX && plist[len] != 0.0)
                len++;

        if (!len)
                return 0;

        /*
         * Calculate bucket values, note down max and min values
         */
        is_last = 0;
        for (i = 0; i < PLAT_NR && !is_last; i++) {
                sum += io_u_plat[i];
                while (sum >= (plist[j] / 100.0 * nr)) {
                        if (j == oval_len) {
                                oval_len += 100;
                                ovals = realloc(ovals, oval_len * sizeof(unsigned int));
                        }

                        ovals[j] = plat_idx_to_val(i);
                        is_last = (j == len - 1);
                        if (is_last)
                                break;

                        j++;
                }
        }

        *output = ovals;
        return len;
}

static int calc_p99(struct stats *s)
{
        unsigned int *ovals = NULL;
        int ret = 0;
        int len;

        len = calc_percentiles(s->plat, s->nr_samples, &ovals);
        if (len && len > PLIST_P99)
                ret = ovals[PLIST_P99];
        if (ovals)
                free(ovals);
        return ret;
}

static void show_latencies(struct stats *s)
{
        unsigned int *ovals = NULL;
        unsigned int len, i;

        len = calc_percentiles(s->plat, s->nr_samples, &ovals);
        if (len) {
                fprintf(stderr, "Latency percentiles (usec)\n");
                for (i = 0; i < len; i++)
                        fprintf(stderr, "\t%s%2.4fth: %u\n",
                                i == PLIST_P99 ? "*" : "",
                                plist[i], ovals[i]);
        }

        if (ovals)
                free(ovals);

        fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max);
}

/* fold latency info from s into d */
void combine_stats(struct stats *d, struct stats *s)
{
        int i;

        for (i = 0; i < PLAT_NR; i++)
                d->plat[i] += s->plat[i];
        d->nr_samples += s->nr_samples;
        d->over += s->over;
        if (s->max > d->max)
                d->max = s->max;
        if (s->min < d->min)
                d->min = s->min;
}

/* record a latency result into the histogram */
static void add_lat(struct stats *s, unsigned int us)
{
        int lat_index = 0;

        if (us > s->max)
                s->max = us;
        if (us < s->min)
                s->min = us;

        if (us > max_us) {
                fprintf(stderr, "latency=%u usec\n", us);
                s->over++;
        }

        lat_index = plat_val_to_idx(us);
        __sync_fetch_and_add(&s->plat[lat_index], 1);
        __sync_fetch_and_add(&s->nr_samples, 1);
}
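/*
 * strictly speaking the atomics above aren't needed: each worker only
 * ever records into its own stats struct, and the message thread reads
 * it only after pthread_join()
 */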

/*
 * every thread has one of these, it comes out to about 19K thanks to the
 * giant stats struct
 */
struct thread_data {
        pthread_t tid;
        /* ->next is for placing us on the msg_thread's list for waking */
        struct thread_data *next;

        /* our parent thread and messaging partner */
        struct thread_data *msg_thread;

        /*
         * the msg thread stuffs gtod in here before waking us, so we can
         * measure scheduler latency
         */
        struct timeval wake_time;

        /* keep the futex and the wake_time in the same cacheline */
        int futex;

        /* mr axboe's magic latency histogram */
        struct stats stats;
};

/* we're so fancy we make our own futex wrappers */
#define FUTEX_BLOCKED 0
#define FUTEX_RUNNING 1

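/*
 * glibc doesn't ship a wrapper for futex(2), so we call it through
 * syscall()
 */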
static int futex(int *uaddr, int futex_op, int val,
                 const struct timespec *timeout, int *uaddr2, int val3)
{
        return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3);
}

/*
 * wakeup a process waiting on a futex, making sure they are really waiting
 * first
 */
static void fpost(int *futexp)
{
        int s;

        if (__sync_bool_compare_and_swap(futexp, FUTEX_BLOCKED,
                                         FUTEX_RUNNING)) {
                s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0);
                if (s == -1) {
                        perror("FUTEX_WAKE");
                        exit(1);
                }
        }
}

/*
 * wait on a futex, with an optional timeout.  Make sure to set
 * the futex to FUTEX_BLOCKED beforehand.
 *
 * This will return zero if all went well, or return -ETIMEDOUT if you
 * hit the timeout without getting posted
 */
static int fwait(int *futexp, struct timespec *timeout)
{
        int s;

        while (1) {
                /* Is the futex available? */
                if (__sync_bool_compare_and_swap(futexp, FUTEX_RUNNING,
                                                 FUTEX_BLOCKED)) {
                        break;  /* Yes */
                }
                /* Futex is not available; wait */
                s = futex(futexp, FUTEX_WAIT, FUTEX_BLOCKED, timeout, NULL, 0);
                if (s == -1 && errno != EAGAIN) {
                        if (errno == ETIMEDOUT)
                                return -ETIMEDOUT;
                        perror("futex-FUTEX_WAIT");
                        exit(1);
                }
        }
        return 0;
}

/*
 * cmpxchg based list prepend
 */
static void xlist_add(struct thread_data *head, struct thread_data *add)
{
        struct thread_data *old;
        struct thread_data *ret;

        while (1) {
                old = head->next;
                add->next = old;
                ret = __sync_val_compare_and_swap(&head->next, old, add);
                if (ret == old)
                        break;
        }
}

/*
 * xchg based list splicing.  This returns the entire list and
 * replaces the head->next with NULL
 */
static struct thread_data *xlist_splice(struct thread_data *head)
{
        struct thread_data *old;
        struct thread_data *ret;

        while (1) {
                old = head->next;
                ret = __sync_val_compare_and_swap(&head->next, old, NULL);
                if (ret == old)
                        break;
        }
        return ret;
}

/*
 * Wake everyone currently waiting on the message list, filling in their
 * thread_data->wake_time with the current time.
 *
 * It's not exactly the current time, it's really the time at the start of
 * the list run.  We want to detect when the scheduler is just preempting the
 * waker and giving away the rest of its timeslice.  So we gtod once at
 * the start of the loop and use that for all the threads we wake.
 */
static void xlist_wake_all(struct thread_data *td)
{
        struct thread_data *list;
        struct thread_data *next;
        struct timeval now;

        list = xlist_splice(td);
        gettimeofday(&now, NULL);
        while (list) {
                next = list->next;
                list->next = NULL;
                memcpy(&list->wake_time, &now, sizeof(now));
                fpost(&list->futex);
                list = next;
        }
}

/*
 * called by worker threads to send a message and wait for the answer.
 * In reality we're just trading one cacheline with the gtod and futex in
 * it, but that's good enough.  We gtod after waking and use that to
 * record scheduler latency.
 */
static void msg_and_wait(struct thread_data *td)
{
        struct timeval now;
        unsigned long long delta;

        /* set ourselves to blocked */
        td->futex = FUTEX_BLOCKED;
        gettimeofday(&td->wake_time, NULL);

        /* add us to the list */
        xlist_add(td->msg_thread, td);

        fpost(&td->msg_thread->futex);

        /*
         * don't wait if the main threads are shutting down,
         * they will never kick us.  fpost has a full barrier, so as long
         * as the message thread walks his list after setting stopping,
         * we shouldn't miss the wakeup
         */
        if (!stopping) {
                /* if he hasn't already woken us up, wait */
                fwait(&td->futex, NULL);
        }

        gettimeofday(&now, NULL);
        delta = tvdelta(&td->wake_time, &now);
        if (delta > 0)
                add_lat(&td->stats, delta);
}

/*
 * once the message thread starts all his children, this is where he
 * loops until our runtime is up.  Basically this sits around waiting
 * for posting by the worker threads, replying to their messages after
 * a delay of 'sleeptime' + some jitter.
 */
static void run_msg_thread(struct thread_data *td)
{
        struct timeval now;
        struct timespec timeout;
        unsigned int seed = pthread_self();
        int max_jitter = sleeptime / 4;
        int jitter;

        jitter = rand_r(&seed) % max_jitter;
        timeout.tv_sec = 0;
        timeout.tv_nsec = (sleeptime + jitter) * 1000;

        while (1) {
                td->futex = FUTEX_BLOCKED;
                xlist_wake_all(td);

                gettimeofday(&now, NULL);
                if (now.tv_sec > global_stop.tv_sec) {
                        stopping = 1;
                        __sync_synchronize();
                        xlist_wake_all(td);
                        break;
                }
                fwait(&td->futex, &timeout);

                /*
                 * messages shouldn't be instant, sleep a little to make them
                 * wait
                 */
                jitter = rand_r(&seed) % max_jitter;
                usleep(sleeptime + jitter);
        }
}

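/*
 * rep;nop is the x86 pause instruction, the standard "I'm spinning"
 * hint to the core; as written this makes the spin loop x86-only
 */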
#define nop __asm__ __volatile__("rep;nop": : :"memory")

static void usec_spin(unsigned long spin_time)
{
        struct timeval now;
        struct timeval start;
        unsigned long long delta;

        gettimeofday(&start, NULL);
        while (1) {
                gettimeofday(&now, NULL);
                delta = tvdelta(&start, &now);
                if (delta > spin_time)
                        return;
                nop;
        }
}

/*
 * the worker thread is pretty simple, it just does a single spin and
 * then waits on a message from the message thread
 */
void *worker_thread(void *arg)
{
        struct thread_data *td = arg;

        while (1) {
                if (stopping)
                        break;

                usec_spin(cputime);
                msg_and_wait(td);
        }
        return NULL;
}

/*
 * the message thread starts his own gaggle of workers and then sits around
 * replying when they post him.  He collects latency stats as all the threads
 * exit
 */
void *message_thread(void *arg)
{
        struct thread_data *td = arg;
        struct thread_data *worker_threads_mem = NULL;
        int i;
        int ret;

        worker_threads_mem = calloc(worker_threads, sizeof(struct thread_data));
        if (!worker_threads_mem) {
                perror("unable to allocate ram");
                pthread_exit((void *)-ENOMEM);
        }

        for (i = 0; i < worker_threads; i++) {
                pthread_t tid;

                worker_threads_mem[i].msg_thread = td;
                ret = pthread_create(&tid, NULL, worker_thread,
                                     worker_threads_mem + i);
                if (ret) {
                        fprintf(stderr, "error %d from pthread_create\n", ret);
                        exit(1);
                }
                worker_threads_mem[i].tid = tid;
        }

        run_msg_thread(td);

        for (i = 0; i < worker_threads; i++) {
                pthread_join(worker_threads_mem[i].tid, NULL);
                combine_stats(&td->stats, &worker_threads_mem[i].stats);
        }
        free(worker_threads_mem);

        return NULL;
}

int main(int ac, char **av)
{
        int i;
        int ret;
        struct thread_data *message_threads_mem = NULL;
        struct stats stats;

        parse_options(ac, av);
again:
        stopping = 0;
        memset(&stats, 0, sizeof(stats));

        message_threads_mem = calloc(message_threads,
                                     sizeof(struct thread_data));
        if (!message_threads_mem) {
                perror("unable to allocate ram");
                exit(1);
        }
        gettimeofday(&global_stop, NULL);
        global_stop.tv_sec += runtime;

        /* start our message threads, each one starts its own workers */
        for (i = 0; i < message_threads; i++) {
                pthread_t tid;

                ret = pthread_create(&tid, NULL, message_thread,
                                     message_threads_mem + i);
                if (ret) {
                        fprintf(stderr, "error %d from pthread_create\n", ret);
                        exit(1);
                }
                message_threads_mem[i].tid = tid;
        }
        for (i = 0; i < message_threads; i++) {
                pthread_join(message_threads_mem[i].tid, NULL);
                combine_stats(&stats, &message_threads_mem[i].stats);
        }

        free(message_threads_mem);

        /*
         * in auto bench mode, keep adding workers until our latencies get
         * horrible
         */
        if (autobench) {
                int p99 = calc_p99(&stats);
                fprintf(stderr, "cputime %llu threads %d p99 %d\n",
                        cputime, worker_threads, p99);
                if (p99 < 2000) {
                        worker_threads++;
                        goto again;
                }
        }

        show_latencies(&stats);

        return 0;
}
