Re: [BUG] 4.11.0-rc3 xterm hung in D state on exit, wchan is tty_release_struct

From: Rob Herring
Date: Thu Mar 23 2017 - 09:46:10 EST


On Thu, Mar 23, 2017 at 12:30:18AM -0700, lkml@xxxxxxxxxxx wrote:
> On Wed, Mar 22, 2017 at 11:44:18PM -0700, lkml@xxxxxxxxxxx wrote:
> > On Wed, Mar 22, 2017 at 07:08:46PM -0700, lkml@xxxxxxxxxxx wrote:
> > > Hello list,
> > >
> > > After approximately one day day of running 4.11.0-rc3 with 7e54d9d reverted to
> > > enable regular use, this happened upon destroying an xterm:
> > >
> > > [80817.525112] BUG: unable to handle kernel paging request at 0000000000002260
> > > [80817.525239] IP: n_tty_receive_buf_common+0x68/0xab0
> > > [80817.525312] PGD 0
> > >
> > > [80817.525387] Oops: 0000 [#1] PREEMPT SMP
> > > [80817.525452] CPU: 0 PID: 9532 Comm: kworker/u4:3 Not tainted 4.11.0-rc3-00001-gc56a355 #53
> > > [80817.525564] Hardware name: LENOVO 7668CTO/7668CTO, BIOS 7NETC2WW (2.22 ) 03/22/2011
> > > [80817.525673] Workqueue: events_unbound flush_to_ldisc
> > > [80817.525752] task: ffff967d91d80000 task.stack: ffff9add81f40000
> > > [80817.525839] RIP: 0010:n_tty_receive_buf_common+0x68/0xab0
> > > [80817.525917] RSP: 0018:ffff9add81f43d38 EFLAGS: 00010297
> > > [80817.525992] RAX: 0000000000000000 RBX: ffff967d91c98c00 RCX: 0000000000000001
> > > [80817.526035] RDX: ffff967e73bba58d RSI: ffff967e73bba48d RDI: ffff967d91c98cc0
> > > [80817.526035] RBP: ffff9add81f43dd0 R08: 0000000000000001 R09: 0000000000000000
> > > [80817.526035] R10: 00004980cbe001e0 R11: 0000000000000000 R12: ffff967d87aacf20
> > > [80817.526035] R13: ffff967e73bba58d R14: 0000000000000001 R15: ffff967e74aa8008
> > > [80817.526035] FS: 0000000000000000(0000) GS:ffff967e7bc00000(0000) knlGS:0000000000000000
> > > [80817.526035] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > > [80817.526035] CR2: 0000000000002260 CR3: 0000000099009000 CR4: 00000000000006f0
> > > [80817.526035] Call Trace:
> > > [80817.526035] ? update_curr+0xbb/0x1a0
> > > [80817.526035] n_tty_receive_buf2+0xf/0x20
> > > [80817.526035] tty_ldisc_receive_buf+0x1d/0x50
> > > [80817.526035] tty_port_default_receive_buf+0x40/0x60
> > > [80817.526035] flush_to_ldisc+0x94/0xa0
> > > [80817.526035] process_one_work+0x13b/0x3e0
> > > [80817.526035] worker_thread+0x64/0x4a0
> > > [80817.526035] kthread+0x10f/0x150
> > > [80817.526035] ? process_one_work+0x3e0/0x3e0
> > > [80817.526035] ? __kthread_create_on_node+0x150/0x150
> > > [80817.526035] ret_from_fork+0x29/0x40
> > > [80817.526035] Code: 85 70 ff ff ff e8 59 75 57 00 48 8d 83 00 02 00 00 c7 45 c8 00 00 00 00 48 89 45 98 48 8d 83 28 02 00 00 48 89 45 90 48 8b 45 b8 <48> 8b b0 60 22 00 00 48 8b 08 89 f0 29 c8 f6 83 10 01 00 00 08
> > > [80817.526035] RIP: n_tty_receive_buf_common+0x68/0xab0 RSP: ffff9add81f43d38
> > > [80817.526035] CR2: 0000000000002260
> > > [80817.526035] ---[ end trace 640aec4765d350f2 ]---
> > >
> > >
> > > That xterm process is stuck, and I am unable to start any new xterms, switching to virtual consoles proves useless, presumably there's an important lock held.
> > >
> > <snip>
> >
> > At a casual glance of the v4.10..v4.11-rc3 changes affecting drivers/tty, the
> > commit c3485e looks suspicious to me, these hunks in particular:

Have you tried reverting it to confirm?

I'm not completely surprised. I did ask folks to look at this part.

> >
> > @@ -465,16 +465,6 @@ static void flush_to_ldisc(struct work_struct *work)
> > {
> > struct tty_port *port = container_of(work, struct tty_port, buf.work);
> > struct tty_bufhead *buf = &port->buf;
> > - struct tty_struct *tty;
> > - struct tty_ldisc *disc;
> > -
> > - tty = READ_ONCE(port->itty);
> > - if (tty == NULL)
> > - return;
> > -
> > - disc = tty_ldisc_ref(tty);
> > - if (disc == NULL)
> > - return;
> >
> > mutex_lock(&buf->lock);
> >
> > @@ -504,7 +494,7 @@ static void flush_to_ldisc(struct work_struct *work)
> > continue;
> > }
> >
> > - count = receive_buf(disc, head, count);
> > + count = receive_buf(port, head, count);
> > if (!count)
> > break;
> > head->read += count;
> > @@ -512,7 +502,6 @@ static void flush_to_ldisc(struct work_struct *work)
> >
> > mutex_unlock(&buf->lock);
> >
> > - tty_ldisc_deref(disc);
> > }
> >
> > /**
> >
> > <snip>
> >
> > I'm not familiar with this code at all, but port->buf is part of port, and if
> > the port is destroyed as part of the tty, then perhaps port->buf (and
> > port->buf->lock) may become invalid on us without these:
> >
> > - tty = READ_ONCE(port->itty);
> > - if (tty == NULL)
> > - return;
> > -
> > - disc = tty_ldisc_ref(tty);
> > - if (disc == NULL)
> > - return;

AIUI, the lifetime of the port is tied to the lifetime of the device
while the lifetime of the tty is tied to the open/close of the tty
device. I simply moved this sequence after the buffer processing. My
only guess is that taking and releasing the ldisc ref multiple times is
a problem.

It's also moved to inside the buf->lock mutex, but I don't think that
it would matter.

> >
> > Added Rob Herring, author of c3485ee to CC list.
> >
>
> I suspect this part was a mistake:
>
> - tty = READ_ONCE(port->itty);
> - if (tty == NULL)
> - return;
>
> Note release_tty() tty->port->itty is assigned NULL before calling
> tty_buffer_cancel_work():

The READ_ONCE should still handle that.

Anyway, the changes were purely to try to remove the need for a ldisc in
the serdev case and avoid referencing it. In fact we still have an
ldisc, it's just not used. So we can restore the original ordering.

Can you try this patch:

8<---------------------------------------------------------------------

diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index d05393594f15..9bf2c32efaf7 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -29,7 +29,8 @@ struct serport {
* Callback functions from the tty port.
*/

-static int ttyport_receive_buf(struct tty_port *port, const unsigned char *cp,
+static int ttyport_receive_buf(struct tty_port *port, struct tty_ldisc *disc,
+ const unsigned char *cp,
const unsigned char *fp, size_t count)
{
struct serdev_controller *ctrl = port->client_data;
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index 4e7a4e9dcf4d..3fb0dcfeec45 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -437,7 +437,7 @@ int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf);

static int
-receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
+receive_buf(struct tty_port *port, struct tty_ldisc *disc, struct tty_buffer *head, int count)
{
unsigned char *p = char_buf_ptr(head, head->read);
char *f = NULL;
@@ -445,7 +445,7 @@ receive_buf(struct tty_port *port, struct tty_buffer *head, int count)
if (~head->flags & TTYB_NORMAL)
f = flag_buf_ptr(head, head->read);

- return port->client_ops->receive_buf(port, p, f, count);
+ return port->client_ops->receive_buf(port, disc, p, f, count);
}

/**
@@ -465,6 +465,16 @@ static void flush_to_ldisc(struct work_struct *work)
{
struct tty_port *port = container_of(work, struct tty_port, buf.work);
struct tty_bufhead *buf = &port->buf;
+ struct tty_struct *tty;
+ struct tty_ldisc *disc;
+
+ tty = READ_ONCE(port->itty);
+ if (tty == NULL)
+ return;
+
+ disc = tty_ldisc_ref(tty);
+ if (disc == NULL)
+ return;

mutex_lock(&buf->lock);

@@ -494,7 +504,7 @@ static void flush_to_ldisc(struct work_struct *work)
continue;
}

- count = receive_buf(port, head, count);
+ count = receive_buf(port, disc, head, count);
if (!count)
break;
head->read += count;
@@ -502,6 +512,7 @@ static void flush_to_ldisc(struct work_struct *work)

mutex_unlock(&buf->lock);

+ tty_ldisc_deref(disc);
}

/**
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 1d21a9c1d33e..c0af5021f367 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -19,26 +19,11 @@
#include <linux/serdev.h>

static int tty_port_default_receive_buf(struct tty_port *port,
+ struct tty_ldisc *disc,
const unsigned char *p,
const unsigned char *f, size_t count)
{
- int ret;
- struct tty_struct *tty;
- struct tty_ldisc *disc;
-
- tty = READ_ONCE(port->itty);
- if (!tty)
- return 0;
-
- disc = tty_ldisc_ref(tty);
- if (!disc)
- return 0;
-
- ret = tty_ldisc_receive_buf(disc, p, (char *)f, count);
-
- tty_ldisc_deref(disc);
-
- return ret;
+ return tty_ldisc_receive_buf(disc, p, (char *)f, count);
}

static void tty_port_default_wakeup(struct tty_port *port)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1017e904c0a3..67569fb64dbf 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -219,7 +219,8 @@ struct tty_port_operations {
};

struct tty_port_client_operations {
- int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t);
+ int (*receive_buf)(struct tty_port *, struct tty_ldisc *,
+ const unsigned char *, const unsigned char *, size_t);
void (*write_wakeup)(struct tty_port *port);
};