Re: [PATCH] net/9p/trans_fd.c: fix double list_del() and race in access

From: Tomas Bortoli
Date: Mon Jul 23 2018 - 07:46:50 EST


On 07/23/2018 05:02 AM, Dominique Martinet wrote:
> Tomas Bortoli wrote on Fri, Jul 20, 2018:
>> This patch uses list_del_init() instead of list_del() to remove
>> "req_list" entries. This is to prevent double list_del() calls on the
>> same list entry from provoking a GPF. Furthermore, this patch fixes an
>> access to "req_list" that was made without taking the relevant lock.
>
> Please see comment about locking.
>
> As for list_del to list_del_init, it feels a little wrong to me, but I
> don't have a better idea so let's go with that.

Yes, it's not the best solution.

> Do you know what happened to trigger this? one thread running
> p9_conn_cancel then the other thread doing p9_fd_cancel ?
>

I'm not sure how the races should be prevented. The bug is triggered in
p9_fd_cancel(), and in this case it happens because the status of the
request is REQ_STATUS_UNSENT. However, list_del(&req->req_list) is used
4 times in trans_fd.c:

- p9_read_work()
with the lock, but updating the status afterwards (leads to a race)
- p9_conn_cancel()
without the lock, and updating the status afterwards (leads to a race)
- p9_fd_cancelled()
.. ?
- p9_fd_cancel()
with the lock, run conditionally on the status
BOOM

So maybe we can try to figure out whether the problem is only about
syncing the status between different threads, or whether there is more
to it, but I don't know.

>> Signed-off-by: Tomas Bortoli <tomasbortoli@xxxxxxxxx>
>> Reported-by: syzbot+735d926e9d1317c3310c@xxxxxxxxxxxxxxxxxxxxxxxxx
>> ---
>>
>> net/9p/trans_fd.c | 10 ++++++----
>> 1 file changed, 6 insertions(+), 4 deletions(-)
>>
>> diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
>> index a64b01c56e30..131bb1f059e6 100644
>> --- a/net/9p/trans_fd.c
>> +++ b/net/9p/trans_fd.c
>> @@ -223,7 +223,9 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
>>
>> list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
>> p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
>> - list_del(&req->req_list);
>> + spin_lock_irqsave(&m->client->lock, flags);
>> + list_del_init(&req->req_list);
>> + spin_unlock_irqrestore(&m->client->lock, flags);
>
> Just locking around one item if you're afraid it might change won't be
> enough - list_for_each_entry_safe is only "safe" from removing the
> current element from the list yourself, not from other threads messing
> with it, so you'd need to lock around the whole loop if that's what
> you're protecting against.
>

Right, I thought I had to unlock before calling p9_client_cb(), as is
done here:

https://github.com/torvalds/linux/blob/master/net/9p/trans_fd.c#L375

However, holding the client spinlock (client->lock) for the whole loop
as well doesn't seem to cause problems. See the patch below.

> (Also, since I've taken the other patches to change spin locks on
> client->lock to spin_lock instead of spin_lock_irqsave, please use that
> function for new locking of that variable - in general just basing your
> patches off linux-next's master branch is a good idea.)
>
>> if (!req->t_err)
>> req->t_err = err;
>> p9_client_cb(m->client, req, REQ_STATUS_ERROR);
>> @@ -369,7 +371,7 @@ static void p9_read_work(struct work_struct *work)
>> spin_lock(&m->client->lock);
>> if (m->req->status != REQ_STATUS_ERROR)
>> status = REQ_STATUS_RCVD;
>> - list_del(&m->req->req_list);
>> + list_del_init(&m->req->req_list);
>> spin_unlock(&m->client->lock);
>> p9_client_cb(m->client, m->req, status);
>> m->rc.sdata = NULL;
>> @@ -684,7 +686,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
>> spin_lock(&client->lock);
>>
>> if (req->status == REQ_STATUS_UNSENT) {
>> - list_del(&req->req_list);
>> + list_del_init(&req->req_list);
>> req->status = REQ_STATUS_FLSHD;
>> ret = 0;
>> }
>> @@ -701,7 +703,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
>> * remove it from the list.
>> */
>> spin_lock(&client->lock);
>> - list_del(&req->req_list);
>> + list_del_init(&req->req_list);
>> spin_unlock(&client->lock);
>>
>> return 0;



diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index a64b01c56e30..2ae5f03d872f 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -199,15 +199,14 @@ static void p9_mux_poll_stop(struct p9_conn *m)
static void p9_conn_cancel(struct p9_conn *m, int err)
{
struct p9_req_t *req, *rtmp;
- unsigned long flags;
LIST_HEAD(cancel_list);

p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);

- spin_lock_irqsave(&m->client->lock, flags);
+ spin_lock(&m->client->lock);

if (m->err) {
- spin_unlock_irqrestore(&m->client->lock, flags);
+ spin_unlock(&m->client->lock);
return;
}

@@ -223,11 +222,12 @@ static void p9_conn_cancel(struct p9_conn *m, int err)

list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
- list_del(&req->req_list);
+ list_del_init(&req->req_list);
if (!req->t_err)
req->t_err = err;
p9_client_cb(m->client, req, REQ_STATUS_ERROR);
}
+ spin_unlock(&m->client->lock);
}

static __poll_t
@@ -369,7 +369,7 @@ static void p9_read_work(struct work_struct *work)
spin_lock(&m->client->lock);
if (m->req->status != REQ_STATUS_ERROR)
status = REQ_STATUS_RCVD;
- list_del(&m->req->req_list);
+ list_del_init(&m->req->req_list);
spin_unlock(&m->client->lock);
p9_client_cb(m->client, m->req, status);
m->rc.sdata = NULL;
@@ -684,7 +684,7 @@ static int p9_fd_cancel(struct p9_client *client,
struct p9_req_t *req)
spin_lock(&client->lock);

if (req->status == REQ_STATUS_UNSENT) {
- list_del(&req->req_list);
+ list_del_init(&req->req_list);
req->status = REQ_STATUS_FLSHD;
ret = 0;
}
@@ -701,7 +701,7 @@ static int p9_fd_cancelled(struct p9_client *client,
struct p9_req_t *req)
* remove it from the list.
*/
spin_lock(&client->lock);
- list_del(&req->req_list);
+ list_del_init(&req->req_list);
spin_unlock(&client->lock);

return 0;