Re: NCR53c810 SCSI host - kernel panic

Drew Eckhardt (drew@poohsticks.org)
Wed, 28 Jun 1995 09:38:35 -0600


In message <199506261107.MAA03400@ncgrafix.demon.co.uk>,
steve@ncgrafix.demon.co.uk writes:
>Hi Drew,
> Firstly, this message has also been sent to the linux-kernel mailing list.

As was this response; however linux-scsi or ncr53c810@Colorado.EDU would
probably be more appropriate.

> This has an NCR53c810 SCSI chipset built into the motherboard, and my machine
>has an IBM DPES-31080 1.05GB SCSI hard disk with SCSI ID 0.
>
> Without this drive connected to the SCSI bus, I can boot the machine from a
>boot floppy and a magneto optical root device (this is a SCSI device also).
> If I connect the SCSI hard disk to the SCSI bus, the kernel panics with a
>general protection fault. This fault is generated in the print_insn function on
>the line
>
> dcmd = (insn[0] >> 24) & 0xff;
>
>where insn has the value 0xffffff00. (A presumably invalid pointer in this
>instance)

In theory, this shouldn't happen.

>print_insn is being called from the intr_dma function, immediately after the
>printk("scsi%d : illegal instruction", host->host_no) line.

And if it was happening, we should get a bus fault interrupt instead of
an illegal instruction interrupt, since DSP is pointing off in La-La land.

>This hard disk by default has no jumper on the "Sync Nego" pin. With a jumper
>on this pin, the machine boots fine. (Compaq supply no documentation with this
>drive, so I'm making an educated guess that the sync nego pin is to do with
>synchronous transfer negotiation)

You need to renegotiate synchronous parameters after
- An error condition (SCSI command returned CHECK CONDITION status)
- A bus reset
- Power up

So that you can hot swap drives, nice SCSI devices will initiate the
synchronous negotiation themselves in the last two cases; and we're not
handling this correctly for some reason.

Some SCSI host adapters and their driver software are a bit broken, and
don't like it too much when SCSI devices do synchronous transfer
requests, so most devices provide a way to disable this otherwise desirable
behavior.

>scsi0 : target 0 requesting synchronous transfer period 100ns, offset 15
>scsi0 : setting target 0 to asynchronous SCSI
>scsi0 : illegal instruction general protection: 0000
>
>I don't know very much about SCSI drivers, SCSI protocol etc. so do you have
>any ideas what is going on here?? At the very least, I guess it would be nice
>if the kernel did not panic before your diagnostics have finished printing.
>If on the other hand, my hardware is not behaving properly with respect to the
>SCSI protocol etc - I need to get some hardware that does work.

Your hardware is fine. As of the latest release, the NCR driver does a
bus reset on initialization so that things still work for the people who've
been running Some Other Operating System where the drivers have negotiated
for synchronous transfers and then started up Linux without a reboot, etc.

With nice SCSI devices, this causes them to initiate a synchronous negotiation,
and apparently that's broken.

>Anyway, any ideas, comments etc are appreciated.

Sure. Apply these patches, and we'll see how the diagnostic output changes.
Note that they're against 1.1.10 + ELF patches; there'll be one trivial section
which won't apply to the stock a.out only 1.1.10 sources.

If my girl friend lets me geek out, I'll hook my RZ55 up to my second NCR
chip later today and see what happens in this situation.

--- 53c7,8xx.c 1995/06/23 05:58:11 1.1
+++ 53c7,8xx.c 1995/06/28 15:22:02
@@ -6,7 +6,7 @@
*/


-#define PERM_OPTIONS (OPTION_IO_MAPPED|OPTION_DEBUG_TEST1)
+#define PERM_OPTIONS (OPTION_IO_MAPPED|OPTION_DEBUG_TEST1|OPTION_DEBUG_SDTR)
/*
* Define SCSI_MALLOC to use scsi_malloc instead of kmalloc. Other than
* preventing deadlock, I'm not sure why we'd want to do this.
@@ -23,7 +23,7 @@
* Copyright 1993, 1994, 1995 Drew Eckhardt
* Visionary Computing
* (Unix and Linux consulting and custom programming)
- * drew@Colorado.EDU
+ * drew@PoohSticks.ORG
* +1 (303) 786-7975
*
* TolerANT and SCSI SCRIPTS are registered trademarks of NCR Corporation.
@@ -176,7 +176,9 @@
#include "constants.h"
#include "sd.h"

+static int shutdown (struct Scsi_Host *host);
static void abnormal_finished (struct NCR53c7x0_cmd *cmd, int result);
+static int disable (struct Scsi_Host *host);
static int NCR53c8xx_run_tests (struct Scsi_Host *host);
static int NCR53c8xx_script_len;
static int NCR53c8xx_dsa_len;
@@ -185,7 +187,8 @@
static void intr_phase_mismatch (struct Scsi_Host *host, struct NCR53c7x0_cmd
*cmd);
static void intr_dma (struct Scsi_Host *host, struct NCR53c7x0_cmd *cmd);
-static void print_dsa (struct Scsi_Host *host, unsigned long *dsa);
+static void print_dsa (struct Scsi_Host *host, unsigned long *dsa,
+ const char *prefix);
static int print_insn (struct Scsi_Host *host, unsigned long *insn,
char *prefix, int kernel);

@@ -432,26 +435,6 @@
hostdata->NCR53c7xx_msg_abort = ABORT;
hostdata->NCR53c7xx_msg_nop = NOP;

- /*
- * Set up an interrupt handler if we aren't already sharing an IRQ
- * with another board.
- */
-
- for (search = first_host; search && ((search->hostt != the_template) ||
- (search->irq != host->irq)); search=search->next);
-
- if (!search) {
- if (request_irq(host->irq, NCR53c7x0_intr, SA_INTERRUPT, "53c7,8xx")) {
- printk("scsi%d : IRQ%d not free, detaching\n",
- host->host_no, host->irq);
- scsi_unregister (host);
- return -1;
- }
- } else {
- printk("scsi%d : using interrupt handler previously installed for scsi%d\n",
- host->host_no, search->host_no);
- }
-
printk ("scsi%d : using %s mapped access\n", host->host_no,
(hostdata->options & OPTION_MEMORY_MAPPED) ? "memory" :
"io");
@@ -480,6 +463,13 @@
host->this_id = NCR53c7x0_read8(SCID_REG) & 7;
hostdata->this_id_mask = 1 << host->this_id;
#endif
+
+ if (!host->this_id) {
+ printk("scsi%d : initiator ID was set to 0, changing to 7\n",
+ host->host_no);
+ host->this_id = 7;
+ hostdata->this_id_mask = 1 << 7;
+ };

printk("scsi%d : using initiator ID %d\n", host->host_no,
host->this_id);
@@ -640,6 +630,27 @@
hostdata->expecting_iid = 0;
hostdata->expecting_sto = 0;

+ /*
+ * Set up an interrupt handler if we aren't already sharing an IRQ
+ * with another board.
+ */
+
+ for (search = first_host; search && !(search->hostt == the_template &&
+ search->irq == host->irq && search != host); search=search->next);
+
+ if (!search) {
+ if (request_irq(host->irq, NCR53c7x0_intr, SA_INTERRUPT, "53c7,8xx")) {
+ printk("scsi%d : IRQ%d not free, detaching\n",
+ host->host_no, host->irq);
+ scsi_unregister (host);
+ return -1;
+ }
+ } else {
+ printk("scsi%d : using interrupt handler previously installed for scsi%d\n",
+ host->host_no, search->host_no);
+ }
+
+
if ((hostdata->run_tests && hostdata->run_tests(host) == -1) ||
(hostdata->options & OPTION_DEBUG_TESTS_ONLY)) {
/* XXX Should disable interrupts, etc. here */
@@ -1703,7 +1714,19 @@
hostdata->dsp_changed = 1;
}
return SPECIFIC_INT_NOTHING;
+
+
case A_int_msg_sdtr:
+/*
+ * At this point, hostdata->msg_buf contains
+ * 0 EXTENDED MESSAGE
+ * 1 length
+ * 2 SDTR
+ * 3 period * 4ns
+ * 4 offset
+ */
+
+
if (cmd) {
printk ("scsi%d : target %d %s synchronous transfer period %dns, offset%d\n",
host->host_no, c->target, (cmd->flags & CMD_FLAG_SDTR) ? "accepting" :
@@ -1734,13 +1757,26 @@
}

patch_dsa_32 (cmd->dsa, dsa_msgout_other, 0, 5);
- patch_dsa_32 (cmd->dsa, dsa_msgout_other, 1,
+ patch_dsa_32 (cmd->dsa, dsa_msgout_other, 1, (long)
hostdata->msg_buf);
hostdata->dsp = hostdata->script +
- hostdata->E_respond_message / sizeof(long);
+ hostdata->E_respond_message / sizeof(long);
hostdata->dsp_changed = 1;
}

+ if (hostdata->options & OPTION_DEBUG_SDTR) {
+ printk("scsi%d : DSP=0x%lx, DCMD|DBC=0x%lx, DSA=0x%lx\n"
+ " DSPS=0x%lx, TEMP=0x%lx, DMODE=0x%x,\n"
+ " DNAD=0x%lx, new DSP 0x%lx\n",
+ host->host_no, (long) dsp,
+ NCR53c7x0_read32(DBC_REG),
+ NCR53c7x0_read32(DSA_REG),
+ NCR53c7x0_read32(DSPS_REG),
+ NCR53c7x0_read32(TEMP_REG),
+ (int) NCR53c7x0_read8(hostdata->dmode),
+ NCR53c7x0_read32(DNAD_REG), (long) hostdata->dsp);
+ }
+
if (hostdata->msg_buf[4]) {
int Hz = 1000000000 / (hostdata->msg_buf[3] * 4);
printk ("scsi%d : setting target %d to %d.%02dMhz %s SCSI%s\n"
@@ -2273,9 +2309,9 @@
#if 0
NCR53c7x0_write8(STIME0_REG_800,
((14 << STIME0_800_SEL_SHIFT) & STIME0_800_SEL_MASK)
-/* Disable HTH interrupt */
| ((15 << STIME0_800_HTH_SHIFT) & STIME0_800_HTH_MASK));
#else
+/* Disable HTH interrupt */
NCR53c7x0_write8(STIME0_REG_800,
((14 << STIME0_800_SEL_SHIFT) & STIME0_800_SEL_MASK));
#endif
@@ -3033,14 +3069,10 @@
size_t buflen; /* Length of same */
#endif

-#if 0
- printk("interrupt %d received\n", irq);
-#endif
-
do {
done = 1;
- for (host = first_host; host; host = hostdata->next ?
- hostdata->next : NULL) {
+ for (host = first_host; host; host = host->next)
+ if (host->hostt == the_template && host->irq == irq) {
NCR53c7x0_local_setup(host);

hostdata = (struct NCR53c7x0_hostdata *) host->hostdata;
@@ -3465,7 +3497,8 @@
* may be NULL.
*/

-static void intr_dma (struct Scsi_Host *host, struct NCR53c7x0_cmd *cmd) {
+static void
+intr_dma (struct Scsi_Host *host, struct NCR53c7x0_cmd *cmd) {
NCR53c7x0_local_declare();
struct NCR53c7x0_hostdata *hostdata = (struct NCR53c7x0_hostdata *)
host->hostdata;
@@ -3513,10 +3546,10 @@
} else
#endif
{
- printk("scsi%d : unexpected abort interrupt at\n"
+ printk(KERN_ALERT "scsi%d : unexpected abort interrupt at\n"
" ", host->host_no);
- print_insn (host, dsp, "s ", 1);
- panic(" ");
+ print_insn (host, dsp, KERN_ALERT "s ", 1);
+ FATAL (host);
}
}

@@ -3537,10 +3570,11 @@
~DCNTL_SSM) | DCNTL_STD);
restore_flags(flags);
} else {
- printk("scsi%d : unexpected single step interrupt at\n"
+ printk(KERN_ALERT "scsi%d : unexpected single step interrupt at\n"
" ", host->host_no);
- print_insn (host, dsp, "", 1);
- panic(" mail drew@colorad.edu\n");
+ print_insn (host, dsp, KERN_ALERT "", 1);
+ printk(KERN_ALERT " mail drew@colorado.edu\n");
+ FATAL (host);
}
}

@@ -3585,16 +3619,17 @@
hostdata->expecting_sto = 1;
}
} else {
- printk("scsi%d : illegal instruction ", host->host_no);
- print_insn (host, dsp, "", 1);
- printk("scsi%d : DSP=0x%lx, DCMD|DBC=0x%lx, DSA=0x%lx\n"
+ printk(KERN_ALERT "scsi%d : illegal instruction ", host->host_no);
+ print_insn (host, dsp, KERN_ALERT "", 1);
+ printk(KERN_ALERT "scsi%d : DSP=0x%lx, DCMD|DBC=0x%lx, DSA=0x%lx\n"
" DSPS=0x%lx, TEMP=0x%lx, DMODE=0x%x,\n"
" DNAD=0x%lx\n",
host->host_no, (unsigned long) dsp, dbc_dcmd,
(unsigned long) dsa, NCR53c7x0_read32(DSPS_REG),
NCR53c7x0_read32(TEMP_REG), (int) NCR53c7x0_read8(hostdata->dmode),
NCR53c7x0_read32(DNAD_REG));
- panic(" mail drew@Colorado.EDU\n");
+ printk(KERN_ALERT " mail drew@PoohSticks.ORG\n");
+ FATAL (host);
}
}

@@ -3604,17 +3639,18 @@
*/

if (dstat & DSTAT_800_BF) {
- printk("scsi%d : BUS FAULT, DSP=0x%lx, DCMD|DBC=0x%lx, DSA=0x%lx\n"
+ printk(KERN_ALERT "scsi%d : BUS FAULT, DSP=0x%lx, DCMD|DBC=0x%lx, DSA=0x%lx\n"
" DSPS=0x%lx, TEMP=0x%lx, DMODE=0x%x\n",
host->host_no, (unsigned long) dsp, NCR53c7x0_read32(DBC_REG),
(unsigned long) dsa, NCR53c7x0_read32(DSPS_REG),
NCR53c7x0_read32(TEMP_REG), (int) NCR53c7x0_read8(hostdata->dmode));
- print_dsa (host, dsa);
- printk("scsi%d : DSP->\n", host->host_no);
- print_insn(host, dsp, "", 1);
- print_insn(host, next_dsp, "", 1);
+ print_dsa (host, dsa, KERN_ALERT "");
+ printk(KERN_ALERT "scsi%d : DSP->\n", host->host_no);
+ print_insn(host, dsp, KERN_ALERT "", 1);
+ print_insn(host, next_dsp, KERN_ALERT "", 1);
#if 0
- panic(" mail drew@Colorado.EDU\n");
+ printk(KERN_ALERT " mail drew@PoohSticks.ORG\n");
+ FATAL (host);
#else
hostdata->idle = 1;
hostdata->options |= OPTION_DEBUG_INIT_ONLY;
@@ -3641,18 +3677,20 @@
abort_connected(host);
break;
case SPECIFIC_INT_PANIC:
- printk("scsi%d : failure at ", host->host_no);
- print_insn (host, dsp, "", 1);
- panic(" dstat_sir_intr() returned SPECIFIC_INT_PANIC\n");
+ printk(KERN_ALERT "scsi%d : failure at ", host->host_no);
+ print_insn (host, dsp, KERN_ALERT "", 1);
+ printk(KERN_ALERT " dstat_sir_intr() returned SPECIFIC_INT_PANIC\n");
+ FATAL (host);
break;
case SPECIFIC_INT_BREAK:
intr_break (host, cmd);
break;
default:
- printk("scsi%d : failure at ", host->host_no);
- print_insn (host, dsp, "", 1);
- panic(" dstat_sir_intr() returned unknown value %d\n",
+ printk(KERN_ALERT "scsi%d : failure at ", host->host_no);
+ print_insn (host, dsp, KERN_ALERT "", 1);
+ printk(KERN_ALERT" dstat_sir_intr() returned unknown value %d\n",
tmp);
+ FATAL (host);
}
}

@@ -3678,23 +3716,40 @@
* Returns : size, in longs, of instruction printed.
*/

-static int print_insn (struct Scsi_Host *host, unsigned long *insn,
+static int
+print_insn (struct Scsi_Host *host, unsigned long *insn,
char *prefix, int kernel) {
char buf[80], /* Temporary buffer and pointer */
*tmp;
unsigned char dcmd; /* dcmd register for *insn */
int size;

- dcmd = (insn[0] >> 24) & 0xff;
- sprintf(buf, "%s%08lx : 0x%08lx 0x%08lx", (prefix ? prefix : ""),
- (unsigned long) insn, insn[0], insn[1]);
- tmp = buf + strlen(buf);
- if ((dcmd & DCMD_TYPE_MASK) == DCMD_TYPE_MMI) {
- sprintf (tmp, " 0x%08lx\n", insn[2]);
- size = 3;
+ /*
+ * Check to see if the instruction pointer is not bogus before
+ * indirecting through it.
+ *
+ * FIXME: icky magic needs to happen here on non-intel boxes which
+ * don't have kernel memory mapped in like this. Might be reasonable
+ * to use vverify()?
+ */
+
+ if ((unsigned long) insn > (high_memory - 8) ||
+ ((((dcmd = (insn[0] >> 24) & 0xff) & DCMD_TYPE_MMI) == DCMD_TYPE_MMI) &&
+ (unsigned long) insn > (high_memory - 12))) {
+ size = 0;
+ sprintf (buf, "%s%08lx : address out of range\n",
+ prefix, (long) insn);
} else {
- sprintf (tmp, "\n");
- size = 2;
+ sprintf(buf, "%s%08lx : 0x%08lx 0x%08lx", (prefix ? prefix : ""),
+ (unsigned long) insn, insn[0], insn[1]);
+ tmp = buf + strlen(buf);
+ if ((dcmd & DCMD_TYPE_MASK) == DCMD_TYPE_MMI) {
+ sprintf (tmp, " 0x%08lx\n", insn[2]);
+ size = 3;
+ } else {
+ sprintf (tmp, "\n");
+ size = 2;
+ }
}

if (kernel)
@@ -3857,14 +3912,17 @@
* therefore shares the scsicam_bios_param function.
*/

-static void print_dsa (struct Scsi_Host *host, unsigned long *dsa) {
+static void
+print_dsa (struct Scsi_Host *host, unsigned long *dsa,
+ const char *prefix) {
struct NCR53c7x0_hostdata *hostdata = (struct NCR53c7x0_hostdata *)
host->hostdata;
int i, len;
char *ptr;

- printk("scsi%d : dsa at 0x%x\n"
+ printk("%sscsi%d : dsa at 0x%x\n"
" + %ld : dsa_msgout length = %lu, data = 0x%lx\n" ,
+ prefix ? prefix : "",
host->host_no, (unsigned) dsa, hostdata->dsa_msgout,
dsa[hostdata->dsa_msgout / sizeof(long)],
dsa[hostdata->dsa_msgout / sizeof(long) + 1]);
@@ -3909,11 +3967,38 @@
NCR53c7x0_write8(SCNTL1_REG, SCNTL1_RST);
udelay(25); /* Minimum amount of time to assert RST */
NCR53c7x0_write8(SCNTL1_REG, SCNTL1_RST);
+
+ disable (host);
restore_flags (flags);
return 0;
}
#endif

+/*
+ * Function : static int disable (struct Scsi_Host *host)
+ *
+ * Purpose : disables the given NCR host, causing all commands
+ * to return a driver error. Call this so we can unload the
+ * module during development and try again. Eventually,
+ * we should be able to find clean workarrounds for these
+ * problems.
+ *
+ * Inputs : host - hostadapter to twiddle
+ *
+ * Returns : 0 on success.
+ */
+
+static int
+disable (struct Scsi_Host *host) {
+ struct NCR53c7x0_hostdata *hostdata = (struct NCR53c7x0_hostdata *)
+ host->hostdata;
+ hostdata->options |= OPTION_DEBUG_PROBE_ONLY;
+ printk (KERN_ALERT "scsi%d: disabled. Unload and reload\n",
+ host->host_no);
+ /* Should see that pending commands return errors too */
+ return 0;
+}
+

/*
* Function : static int halt (struct Scsi_Host *host)
@@ -3932,13 +4017,17 @@
unsigned char istat, tmp;
struct NCR53c7x0_hostdata *hostdata = (struct NCR53c7x0_hostdata *)
host->hostdata;
+ int stage;
NCR53c7x0_local_setup(host);

save_flags(flags);
cli();
- NCR53c7x0_write8(hostdata->istat, ISTAT_ABRT);
/* Eat interrupts until we find what we're looking for */
- for (;;) {
+ for (stage = 0;;) {
+ if (stage == 1) {
+ NCR53c7x0_write8(hostdata->istat, ISTAT_ABRT);
+ ++stage;
+ }
istat = NCR53c7x0_read8 (hostdata->istat);
if (istat & ISTAT_SIP) {
if ((hostdata->chip / 100) == 8) {
@@ -3949,13 +4038,20 @@
tmp = NCR53c7x0_read8(SSTAT0_REG);
}
} else if (istat & ISTAT_DIP) {
- NCR53c7x0_write8(hostdata->istat, 0);
tmp = NCR53c7x0_read8(DSTAT_REG);
- if (tmp & DSTAT_ABRT)
- break;
- else
- panic("scsi%d: could not halt NCR chip\n", host->host_no);
+ if (stage == 2) {
+ if (tmp & DSTAT_ABRT)
+ NCR53c7x0_write8(hostdata->istat, 0);
+ else {
+ printk(KERN_ALERT "scsi%d: could not halt NCR chip\n",
+ host->host_no);
+ disable (host);
+ }
+ }
+ break;
}
+ if (stage == 0 && !(istat & (ISTAT_SIP|ISTAT_DIP)))
+ ++stage;
}
hostdata->state = STATE_HALTED;
restore_flags(flags);
--- 53c7,8xx.h 1995/06/23 05:58:11 1.1
+++ 53c7,8xx.h 1995/06/28 15:13:23
@@ -58,7 +58,7 @@
#define NCR53c7xx_release NULL
#endif

-#define NCR53c7xx {NULL, NULL, "NCR53c{7,8}xx (rel 4)", NCR53c7xx_detect, \
+#define NCR53c7xx {NULL, NULL, "NCR53c{7,8}xx (rel 5)", NCR53c7xx_detect, \
NULL, /* info */ NULL, /* command, deprecated */ NULL, \
NCR53c7xx_queue_command, NCR53c7xx_abort, NCR53c7xx_reset, \
NULL /* slave attach */, scsicam_bios_param, /* can queue */ 1, \
@@ -913,7 +913,7 @@
#define OPTION_DEBUG_FIXUP 0x400000 /* print fixup values */
#define OPTION_DEBUG_DSA 0x800000
#define OPTION_DEBUG_CORRUPTION 0x1000000 /* Detect script corruption */
-
+#define OPTION_DEBUG_SDTR 0x2000000 /* Debug SDTR problem */
#if !defined(PERM_OPTIONS)
#define PERM_OPTIONS 0
#endif
@@ -1362,10 +1362,10 @@
if (hostdata->options & OPTION_DEBUG_DSA) \
printk("scsi : dsa %s symbol %s(%ld) word %d now 0x%lx\n", \
#dsa, #symbol, (long) hostdata->##symbol, \
- (int) (word), (long) (value)); \
+ (int) (word), (long) (value)); \
}
-

+#define FATAL(host) shutdown((host));

#endif /* NCR53c7x0_C */
#endif /* NCR53c7x0_H */