Re: floppy.c soft lockup

From: Mark Hounschell
Date: Sat Jun 02 2007 - 16:44:40 EST

Next message: Linus Torvalds: "Re: [RFC][PATCH] IO-APIC blacklist"
Previous message: Tear: "Re: [RFC][PATCH] IO-APIC blacklist"
In reply to: Oleg Nesterov: "Re: floppy.c soft lockup"
Next in thread: Oleg Nesterov: "Re: floppy.c soft lockup"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Oleg Nesterov wrote:
> On 06/01, Mark Hounschell wrote:
>> Oleg Nesterov wrote:
>>> On 06/01, Mark Hounschell wrote:
>>>> Ok the prctl never returned. I just replaced the ioctl with it and added
>>>> a printf before and after. I only get the one before. The thread is hung
>>>> at this point just as if I'd done the ioctl?
>>> Thanks. So we can rule out floppy.c. flush_scheduled_work/flush_workqueue
>>> is broken by this RT application. Imho, this is not the kernel problem.
>>>
>>> Now I am very sure that the initial suspect was correct: cpu starvation.
>>> I can cook a debug patch to be 100% sure tomorrow, which kernel version is
>>> most convenient to you?
>>>
>> 2.6.22-rc3 is fine thanks.
>
> Please try this patch, it should dump some debug info when flush_workqueue()
> hangs (after 30 seconds). You can use it with or without the previous patch
> I sent. Please wait for a couple of minutes to collect more info.
>
> Oleg.
>
> --- OLD/kernel/sched.c~TST 2007-04-05 12:20:35.000000000 +0400
> +++ OLD/kernel/sched.c 2007-06-02 15:41:53.000000000 +0400
> @@ -4177,6 +4177,20 @@ struct task_struct *idle_task(int cpu)
> return cpu_rq(cpu)->idle;
> }
>
> +struct task_struct *get_cpu_curr(int cpu)
> +{
> + unsigned long flags;
> + struct task_struct *curr;
> + struct rq *rq = cpu_rq(cpu);
> +
> + spin_lock_irqsave(&rq->lock, flags);
> + curr = rq->curr;
> + get_task_struct(curr);
> + spin_unlock_irqrestore(&rq->lock, flags);
> +
> + return curr;
> +}
> +
> /**
> * find_process_by_pid - find a process with a matching PID value.
> * @pid: the pid in question.
> --- OLD/kernel/workqueue.c~TST 2007-06-02 13:34:57.000000000 +0400
> +++ OLD/kernel/workqueue.c 2007-06-02 16:18:02.000000000 +0400
> @@ -49,6 +49,7 @@ struct cpu_workqueue_struct {
> struct task_struct *thread;
>
> int run_depth; /* Detect run_workqueue() recursion depth */
> + int jobs;
> } ____cacheline_aligned;
>
> /*
> @@ -253,6 +254,7 @@ static void run_workqueue(struct cpu_wor
>
> cwq->current_work = work;
> list_del_init(cwq->worklist.next);
> + cwq->jobs++;
> spin_unlock_irq(&cwq->lock);
>
> BUG_ON(get_wq_data(work) != cwq);
> @@ -328,7 +330,48 @@ static void insert_wq_barrier(struct cpu
> insert_work(cwq, &barr->work, tail);
> }
>
> -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
> +extern struct task_struct *get_cpu_curr(int cpu);
> +
> +static void flush_wait(struct cpu_workqueue_struct *cwq, int cpu, struct completion *done)
> +{
> + struct task_struct *curr;
> + struct work_struct *work;
> + int old_pid, jobs;
> +
> + if (is_single_threaded(cwq->wq))
> + cpu = raw_smp_processor_id();
> +
> +again:
> + work = cwq->current_work;
> + jobs = cwq->jobs;
> +
> + curr = get_cpu_curr(cpu);
> + old_pid = curr->pid;
> + put_task_struct(curr);
> +
> + if (wait_for_completion_timeout(done, HZ * 30))
> + return;
> +
> + printk(KERN_ERR "ERR!! %s flush hang: %p %p %d %d\n", cwq->thread->comm,
> + work, cwq->current_work, jobs, cwq->jobs);
> +
> + curr = get_cpu_curr(cpu);
> + printk(KERN_ERR "CURR: %d %d %s %ld %ld\n", old_pid, curr->pid,
> + curr->comm, curr->nivcsw, curr->nvcsw);
> + put_task_struct(curr);
> +
> + spin_lock_irq(&cwq->lock);
> + list_for_each_entry(work, &cwq->worklist, entry)
> + print_symbol(" %s\n", (unsigned long) work->func);
> + printk(" ----\n");
> + if (cwq->current_work)
> + print_symbol(" %s\n", (unsigned long) cwq->current_work->func);
> + spin_unlock_irq(&cwq->lock);
> +
> + goto again;
> +}
> +
> +static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq, int cpu)
> {
> int active;
>
> @@ -351,7 +394,7 @@ static int flush_cpu_workqueue(struct cp
> spin_unlock_irq(&cwq->lock);
>
> if (active)
> - wait_for_completion(&barr.done);
> + flush_wait(cwq, cpu, &barr.done);
> }
>
> return active;
> @@ -377,7 +420,7 @@ void fastcall flush_workqueue(struct wor
>
> might_sleep();
> for_each_cpu_mask(cpu, *cpu_map)
> - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
> + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu), cpu);
> }
> EXPORT_SYMBOL_GPL(flush_workqueue);
>
> @@ -748,7 +791,7 @@ static void cleanup_workqueue_thread(str
> * checks list_empty(), and a "normal" queue_work() can't use
> * a dead CPU.
> */
> - while (flush_cpu_workqueue(cwq))
> + while (flush_cpu_workqueue(cwq, cpu))
> ;
>
> kthread_stop(cwq->thread);
>

Jun 2 16:36:11 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:36:11 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:36:11 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:36:11 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:36:11 harley kernel: ----
Jun 2 16:36:11 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:36:41 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:36:41 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:36:41 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:36:41 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:36:41 harley kernel: ----
Jun 2 16:36:41 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:37:11 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:37:11 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:37:11 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:37:11 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:37:11 harley kernel: ----
Jun 2 16:37:11 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:37:41 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:37:41 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:37:41 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:37:41 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:37:41 harley kernel: ----
Jun 2 16:37:41 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:37:51 harley kernel: RTOM: In int handler for 12 usec.
Jun 2 16:38:11 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:38:11 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:38:11 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:38:11 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:38:11 harley kernel: ----
Jun 2 16:38:11 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:38:41 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:38:41 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:38:41 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:38:41 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:38:41 harley kernel: ----
Jun 2 16:38:41 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:39:11 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:39:11 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:39:11 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:39:11 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:39:11 harley kernel: ----
Jun 2 16:39:11 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:39:41 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:39:41 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:39:41 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:39:41 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:39:41 harley kernel: ----
Jun 2 16:39:41 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:40:11 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:40:11 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:40:11 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:40:11 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:40:11 harley kernel: ----
Jun 2 16:40:11 harley kernel: cache_reap+0x0/0xf4
Jun 2 16:40:41 harley kernel: ERR!! events/1 flush hang: c201dbc0
c201dbc0 10012 10012
Jun 2 16:40:41 harley kernel: CURR: 7974 7974 vrsx 93 26
Jun 2 16:40:41 harley kernel: wq_barrier_func+0x0/0x8
Jun 2 16:40:41 harley kernel: vmstat_update+0x0/0x24
Jun 2 16:40:41 harley kernel: ----
Jun 2 16:40:41 harley kernel: cache_reap+0x0/0xf4

Regards
Mark
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Linus Torvalds: "Re: [RFC][PATCH] IO-APIC blacklist"
Previous message: Tear: "Re: [RFC][PATCH] IO-APIC blacklist"
In reply to: Oleg Nesterov: "Re: floppy.c soft lockup"
Next in thread: Oleg Nesterov: "Re: floppy.c soft lockup"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]