re: (FOUND - Need patch) bad bad unknown brokeness

From: david (david@killerlabs.com)
Date: Mon Mar 20 2000 - 22:25:39 EST


ok, i sent another email, here's the output, i managed to get a ps axwl run.
note, it does NOT always hang under the same circumstances, but sending
large bodied transactions has a high chance of triggering it.

(ps outputs trimmed)

James:~# ps axwl
  F UID PID PPID PRI NI VSZ RSS WCHAN STAT TTY TIME COMMAND
000 0 120 1 8 0 1032 308 wait_o D ? 0:00 /usr/local/sbin/watchdogd
000 501 716 198 16 0 5036 2432 nanosl S pts/1 0:02 pine
000 501 4612 716 17 0 1472 784 wait_o D pts/1 0:00 joe /tmp/pico.000716

the child proc of pine is joe, I was exiting joe which would flush the
buffer to /tmp/pico.000716 then return to pine, pine would read that file.

ok, I repeated the exercise to gain more information. this time joe
managed to exit and pine started the smtp session and hung at the end. here
is the ps awxl:

nifty:/usr/src/linux-99p3# grep "wait_o" System.map
c01232ec T ___wait_on_page
c012e1cc T __wait_on_buffer
c0131b8c T __wait_on_super
c013e394 t __wait_on_inode
c0152d9c t __wait_on_dquot
c020b25f ? __kstrtab___wait_on_buffer
c020b27a ? __kstrtab____wait_on_page
c020c711 ? __kstrtab___wait_on_super
c0210b80 ? __ksymtab___wait_on_buffer
c0210b88 ? __ksymtab____wait_on_page
c02111c8 ? __ksymtab___wait_on_super

ok, let's try this exercise without quotas enabled.

doesn't affect it.

000 0 120 1 0 0 1032 308 12e345 D ? 0:00 /usr/local/sbin/watchdogd
140 516 408 383 0 0 4292 2512 12e345 D ? 0:00 /usr/local/apache/bin/httpd
040 8 416 414 12 0 2348 1576 15b7e5 D ? 0:15 sendmail: SAA00416 IDENT:mail@mail.l
000 0 425 351 10 0 1376 656 12e345 D pts/2 0:00 wget -r www.kalifornia.com

c012e1cc T __wait_on_buffer
c012e3a4 t sync_buffers

(unrelated i believe)
c015b5a8 t __get_request_wait
c015b848 T is_read_only

Dump of assembler code for function __wait_on_buffer:
[snipped to the point]
0xc012e314 <__wait_on_buffer+328>: movl (%ebx),%ebx
0xc012e316 <__wait_on_buffer+330>: lock addl $0x0,0x0(%esp,1)
0xc012e31c <__wait_on_buffer+336>: movl $0x0,0x4(%eax)
0xc012e323 <__wait_on_buffer+343>: testl %edx,%edx
0xc012e325 <__wait_on_buffer+345>:
    je 0xc012e308 <__wait_on_buffer+316>
0xc012e327 <__wait_on_buffer+347>: pushl %ecx
0xc012e328 <__wait_on_buffer+348>: call *%edx
0xc012e32a <__wait_on_buffer+350>: addl $0x4,%esp
0xc012e32d <__wait_on_buffer+353>:
    jmp 0xc012e308 <__wait_on_buffer+316>
0xc012e32f <__wait_on_buffer+355>: nop
0xc012e330 <__wait_on_buffer+356>: movl 0x10(%esp,1),%edx
0xc012e334 <__wait_on_buffer+360>: movl $0x2,(%edx)
0xc012e33a <__wait_on_buffer+366>: testb $0x4,0x18(%ebp)
0xc012e33e <__wait_on_buffer+370>:
    je 0xc012e34c <__wait_on_buffer+384>
0xc012e340 <__wait_on_buffer+372>: call 0xc011450c <schedule>
0xc012e345 <__wait_on_buffer+377>: testb $0x4,0x18(%ebp)
0xc012e349 <__wait_on_buffer+381>:
    jne 0xc012e2e8 <__wait_on_buffer+284>
0xc012e34b <__wait_on_buffer+383>: nop
0xc012e34c <__wait_on_buffer+384>: movl 0x10(%esp,1),%ecx
0xc012e350 <__wait_on_buffer+388>: movl $0x0,(%ecx)
0xc012e356 <__wait_on_buffer+394>: pushf
0xc012e357 <__wait_on_buffer+395>: popl %ebx
0xc012e358 <__wait_on_buffer+396>: cli
0xc012e359 <__wait_on_buffer+397>: movl 0x3c(%esp,1),%eax
0xc012e35d <__wait_on_buffer+401>: cmpl %edi,%eax
0xc012e35f <__wait_on_buffer+403>:
    je 0xc012e388 <__wait_on_buffer+444>
0xc012e361 <__wait_on_buffer+405>: pushl %edi
0xc012e362 <__wait_on_buffer+406>: pushl %eax
0xc012e363 <__wait_on_buffer+407>: pushl $0xc01e2920
0xc012e368 <__wait_on_buffer+412>: call 0xc0117170 <printk>

linux/fs/buffer.c:134

  134 /*
  135 * Rewrote the wait-routines to use the "new" wait-queue functionality,
  136 * and getting rid of the cli-sti pairs. The wait-queue routines still
  137 * need cli-sti, but now it's just a couple of 386 instructions or so.
  138 *
  139 * Note that the real wait_on_buffer() is an inline function that checks
  140 * if 'b_wait' is set before calling this, so that the queues aren't set
  141 * up unnecessarily.
  142 */
  143 void __wait_on_buffer(struct buffer_head * bh)
  144 {
  145 struct task_struct *tsk = current;
  146 DECLARE_WAITQUEUE(wait, tsk);
  147
  148 atomic_inc(&bh->b_count);
  149 add_wait_queue(&bh->b_wait, &wait);
  150 do {
  151 run_task_queue(&tq_disk);
  152 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  153 if (!buffer_locked(bh))
  154 break;
  155 schedule();
  156 } while (buffer_locked(bh));
  157 tsk->state = TASK_RUNNING;
  158 remove_wait_queue(&bh->b_wait, &wait);
  159 atomic_dec(&bh->b_count);
  160 }

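It looks like it's hanging in the loop at lines 155-156: the WCHAN value
12e345 is the return address of the schedule() call on line 155, just before
the buffer_locked() re-test on line 156, so the buffer never appears to get
unlocked.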

include/linux/fs.h:#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0)
linux/include/linux/fs.h:#define buffer_locked(bh) __buffer_state(bh,Lock)

I'm building a kernel now with a printk to show the b_state

-d

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Thu Mar 23 2000 - 21:00:31 EST