Re: printk.time causes rare kernel boot hangs

From: Richard W.M. Jones
Date: Wed Jun 14 2023 - 13:35:28 EST



FWIW attached is a test program that runs the qemu instances in
parallel (up to 8 threads), which seems to be a quicker way to hit the
problem for me. Even on Intel, with this test I can hit the bug in a
few hundred iteration.

Rich.

--
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <error.h>
#include <unistd.h>
#include <sys/wait.h>
#include <pthread.h>

#define NR_ITERATIONS 10000
#define MAX_THREADS 8
#define MAX_TIME 60 /* max time to wait for qemu to complete */

//#define VMLINUX "/home/rjones/d/linux/vmlinux"
#define VMLINUX "vmlinux"
//#define QEMU "/home/rjones/d/qemu/build/qemu-system-x86_64"
#define QEMU "qemu-system-x86_64"
#define QEMU_COMMAND \
QEMU " -no-user-config -nodefaults -display none " \
"-machine accel=kvm:tcg,graphics=off -cpu max,la57=off -m 1280 " \
"-no-reboot " \
"-rtc driftfix=slew -no-hpet -global kvm-pit.lost_tick_policy=discard " \
"-kernel " VMLINUX " " \
"-serial stdio " \
"-append \"panic=1 console=ttyS0 edd=off udevtimeout=6000 udev.event-timeout=6000 no_timer_check printk.time=1 cgroup_disable=memory usbcore.nousb cryptomgr.notests tsc=reliable 8250.nr_uarts=1 selinux=0 TERM=xterm-256color\""

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned iterations;

static void *start_thread (void *);

int
main (int argc, char *argv[])
{
long n, i;
int r;
pthread_t thread[MAX_THREADS];

n = sysconf (_SC_NPROCESSORS_ONLN);
if (n == -1) error (EXIT_FAILURE, errno, "sysconf");

if (n > MAX_THREADS)
n = MAX_THREADS;

for (i = 0; i < n; ++i) {
r = pthread_create (&thread[i], NULL, start_thread, NULL);
if (r != 0) error (EXIT_FAILURE, r, "pthread_create");
}
for (i = 0; i < n; ++i) {
r = pthread_join (thread[i], NULL);
if (r != 0) error (EXIT_FAILURE, r, "pthread_join");
}
printf ("\n");
printf ("\n");
printf ("test ok\n");
exit (EXIT_SUCCESS);
}

static void *
start_thread (void *vp)
{
pid_t pid;
char tmp[] = "/tmp/kernel.out.XXXXXX";
char cmd[1024];
int i, r, status;

if (mkstemp (tmp) == -1)
error (EXIT_FAILURE, errno, "mkstemp: %s", tmp);

snprintf (cmd, sizeof cmd, QEMU_COMMAND " >& %s", tmp);

/* This basically runs a loop starting qemu. */
for (;;) {
pthread_mutex_lock (&lock);
if (iterations >= NR_ITERATIONS) {
pthread_mutex_unlock (&lock);
return NULL;
}
if (iterations <= MAX_THREADS) { // stagger the start times
pthread_mutex_unlock (&lock);
usleep (rand () % 3000000);
pthread_mutex_lock (&lock);
}
iterations++;
printf ("%d... ", iterations); fflush (stdout);
pthread_mutex_unlock (&lock);

pid = fork ();
if (pid == -1) error (EXIT_FAILURE, errno, "fork");
if (pid == 0) {
/* Child process, run qemu and wait. */
if (system (cmd) != 0)
_exit (EXIT_FAILURE);
else
_exit (EXIT_SUCCESS);
}

/* In the parent wait up to MAX_TIME seconds. */
for (i = 0; i < MAX_TIME; ++i) {
r = waitpid (pid, &status, WNOHANG);
if (r == -1) error (EXIT_FAILURE, errno, "waitpid");
if (r > 0) break;
sleep (1);
}

if (i == MAX_TIME || status != 0) {
/* Something failed in qemu (or it didn't
* exit), dump the whole log and exit with
* error.
*/
printf ("\n");
printf ("\n");
snprintf (cmd, sizeof cmd, "tail -20 %s", tmp);
system (cmd);
fprintf (stderr, "*** ERROR OR HANG ***\n");
exit (EXIT_FAILURE);
}
}

unlink (tmp);
}