Re: 2.5.68-mmX: Drowning in irq 7: nobody cared!

From: Andrew Morton (akpm@digeo.com)
Date: Tue May 06 2003 - 10:17:16 EST


Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:
>
> With APIC at least it doesn't surprise me in the least. The IRQ hack seems
> extremely racy.

Good point. How about we do something like "if half of the past 1000
interrupts weren't handled then try to kill the IRQ"?

 arch/i386/kernel/irq.c | 55 +++++++++++++++++++++++++++++++++++++++----------
 include/linux/irq.h | 2 +
 2 files changed, 46 insertions(+), 11 deletions(-)

diff -puN arch/i386/kernel/irq.c~irq-check-rate-limit arch/i386/kernel/irq.c
--- 25/arch/i386/kernel/irq.c~irq-check-rate-limit 2003-05-06 07:54:17.000000000 -0700
+++ 25-akpm/arch/i386/kernel/irq.c 2003-05-06 08:16:15.000000000 -0700
@@ -66,8 +66,12 @@
 /*
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
- { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+ [0 ... NR_IRQS-1] = {
+ .handler = &no_irq_type,
+ .lock = SPIN_LOCK_UNLOCKED
+ }
+};
 
 static void register_irq_proc (unsigned int irq);
 
@@ -209,7 +213,6 @@ int handle_IRQ_event(unsigned int irq,
 {
         int status = 1; /* Force the "do bottom halves" bit */
         int retval = 0;
- struct irqaction *first_action = action;
 
         if (!(action->flags & SA_INTERRUPT))
                 local_irq_enable();
@@ -222,19 +225,43 @@ int handle_IRQ_event(unsigned int irq,
         if (status & SA_SAMPLE_RANDOM)
                 add_interrupt_randomness(irq);
         local_irq_disable();
- if (retval != 1) {
+ return status;
+}
+
+/*
+ * If 500 of the previous 1000 interrupts have not been handled then assume
+ * that the IRQ is stuck in some manner. Drop a diagnostic and try to turn the
+ * IRQ off.
+ *
+ * Called under desc->lock
+ */
+static void note_interrupt(irq_desc_t *desc, int irq, int status)
+{
+ if (status != IRQ_HANDLED)
+ desc->irqs_unhandled++;
+ desc->irq_count++;
+ if (desc->irq_count < 1000)
+ return;
+
+ desc->irq_count = 0;
+ if (desc->irqs_unhandled > 500) {
+ /*
+ * The interrupt is stuck
+ */
                 static int count = 100;
+ struct irqaction *action;
+
                 if (count) {
                         count--;
- if (retval) {
+ if (status) {
                                 printk("irq event %d: bogus retval mask %x\n",
- irq, retval);
+ irq, status);
                         } else {
                                 printk("irq %d: nobody cared!\n", irq);
                         }
                         dump_stack();
                         printk("handlers:\n");
- action = first_action;
+ action = desc->action;
                         do {
                                 printk("[<%p>]", action->handler);
                                 print_symbol(" (%s)",
@@ -243,9 +270,13 @@ int handle_IRQ_event(unsigned int irq,
                                 action = action->next;
                         } while (action);
                 }
+ /*
+ * Now kill the IRQ
+ */
+ desc->status |= IRQ_DISABLED;
+ desc->handler->disable(irq);
         }
-
- return status;
+ desc->irqs_unhandled = 0;
 }
 
 /*
@@ -418,10 +449,12 @@ asmlinkage unsigned int do_IRQ(struct pt
          * SMP environment.
          */
         for (;;) {
+ int status;
+
                 spin_unlock(&desc->lock);
- handle_IRQ_event(irq, &regs, action);
+ status = handle_IRQ_event(irq, &regs, action);
                 spin_lock(&desc->lock);
-
+ note_interrupt(desc, irq, status);
                 if (likely(!(desc->status & IRQ_PENDING)))
                         break;
                 desc->status &= ~IRQ_PENDING;
diff -puN include/linux/irq.h~irq-check-rate-limit include/linux/irq.h
--- 25/include/linux/irq.h~irq-check-rate-limit 2003-05-06 07:56:03.000000000 -0700
+++ 25-akpm/include/linux/irq.h 2003-05-06 08:05:17.000000000 -0700
@@ -61,6 +61,8 @@ typedef struct {
         hw_irq_controller *handler;
         struct irqaction *action; /* IRQ action list */
         unsigned int depth; /* nested irq disables */
+ unsigned int irq_count; /* For detecting broken interrupts */
+ unsigned int irqs_unhandled;
         spinlock_t lock;
 } ____cacheline_aligned irq_desc_t;
 

_

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Wed May 07 2003 - 22:00:26 EST