Re: [PATCH] fix race in AF_UNIX

From: Miklos Szeredi
Date: Mon Jun 04 2007 - 05:45:55 EST


> A recv() on an AF_UNIX, SOCK_STREAM socket can race with a
> send()+close() on the peer, causing recv() to return zero, even though
> the sent data should be received.
>
> This happens if the send() and the close() is performed between
> skb_dequeue() and checking sk->sk_shutdown in unix_stream_recvmsg():
>
> process A skb_dequeue() returns NULL, there's no data in the socket queue
> process B new data is inserted onto the queue by unix_stream_sendmsg()
> process B sk->sk_shutdown is set to SHUTDOWN_MASK by unix_release_sock()
> process A sk->sk_shutdown is checked, unix_release_sock() returns zero

This is only part of the story. It turns out, there are other races
involving the garbage collector, that can throw away perfectly good
packets with AF_UNIX sockets in them.

The problems arise when a socket goes from installed to in-flight or
vica versa during garbage collection. Since gc is done with a
spinlock held, this only shows up on SMP.

The following patch fixes it for me, but it's possibly the wrong
approach.

Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxx>
---

Index: linux-2.6.22-rc2/net/unix/garbage.c
===================================================================
--- linux-2.6.22-rc2.orig/net/unix/garbage.c 2007-06-03 23:58:11.000000000 +0200
+++ linux-2.6.22-rc2/net/unix/garbage.c 2007-06-04 11:39:42.000000000 +0200
@@ -90,6 +90,7 @@
static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */

atomic_t unix_tot_inflight = ATOMIC_INIT(0);
+DECLARE_RWSEM(unix_gc_sem);


static struct sock *unix_get_socket(struct file *filp)
@@ -169,7 +170,7 @@ static void maybe_unmark_and_push(struct

void unix_gc(void)
{
- static DEFINE_MUTEX(unix_gc_sem);
+ static DEFINE_MUTEX(unix_gc_local_lock);
int i;
struct sock *s;
struct sk_buff_head hitlist;
@@ -179,9 +180,22 @@ void unix_gc(void)
* Avoid a recursive GC.
*/

- if (!mutex_trylock(&unix_gc_sem))
+ if (!mutex_trylock(&unix_gc_local_lock))
return;

+
+ /*
+ * unix_gc_sem protects against sockets going from in-flight to
+ * installed
+ *
+ * Can't sleep on this, because skb_recv_datagram could be
+ * waiting for a packet that is to be sent by the thread which
+ * invoked the gc
+ */
+ if (!down_write_trylock(&unix_gc_sem)) {
+ mutex_unlock(&unix_gc_local_lock);
+ return;
+ }
spin_lock(&unix_table_lock);

forall_unix_sockets(i, s)
@@ -207,8 +221,6 @@ void unix_gc(void)

forall_unix_sockets(i, s)
{
- int open_count = 0;
-
/*
* If all instances of the descriptor are not
* in flight we are in use.
@@ -218,10 +230,20 @@ void unix_gc(void)
* In this case (see unix_create1()) we set artificial
* negative inflight counter to close race window.
* It is trick of course and dirty one.
+ *
+ * Get the inflight counter first, then the open
+ * counter. This avoids problems if racing with
+ * sendmsg
+ *
+ * If just created socket is not yet attached to
+ * a file descriptor, assume open_count of 1
*/
+ int inflight_count = atomic_read(&unix_sk(s)->inflight);
+ int open_count = 1;
+
if (s->sk_socket && s->sk_socket->file)
open_count = file_count(s->sk_socket->file);
- if (open_count > atomic_read(&unix_sk(s)->inflight))
+ if (open_count > inflight_count)
maybe_unmark_and_push(s);
}

@@ -302,11 +324,12 @@ void unix_gc(void)
u->gc_tree = GC_ORPHAN;
}
spin_unlock(&unix_table_lock);
+ up_write(&unix_gc_sem);

/*
* Here we are. Hitlist is filled. Die.
*/

__skb_queue_purge(&hitlist);
- mutex_unlock(&unix_gc_sem);
+ mutex_unlock(&unix_gc_local_lock);
}
Index: linux-2.6.22-rc2/include/net/af_unix.h
===================================================================
--- linux-2.6.22-rc2.orig/include/net/af_unix.h 2007-04-26 05:08:32.000000000 +0200
+++ linux-2.6.22-rc2/include/net/af_unix.h 2007-06-04 09:13:56.000000000 +0200
@@ -14,6 +14,7 @@ extern void unix_gc(void);

extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
extern spinlock_t unix_table_lock;
+extern struct rw_semaphore unix_gc_sem;

extern atomic_t unix_tot_inflight;

Index: linux-2.6.22-rc2/net/unix/af_unix.c
===================================================================
--- linux-2.6.22-rc2.orig/net/unix/af_unix.c 2007-06-03 23:58:11.000000000 +0200
+++ linux-2.6.22-rc2/net/unix/af_unix.c 2007-06-04 11:04:15.000000000 +0200
@@ -1572,6 +1572,7 @@ static int unix_dgram_recvmsg(struct kio

msg->msg_namelen = 0;

+ down_read(&unix_gc_sem);
mutex_lock(&u->readlock);

skb = skb_recv_datagram(sk, flags, noblock, &err);
@@ -1629,6 +1630,7 @@ out_free:
skb_free_datagram(sk,skb);
out_unlock:
mutex_unlock(&u->readlock);
+ up_read(&unix_gc_sem);
out:
return err;
}
@@ -1704,6 +1706,7 @@ static int unix_stream_recvmsg(struct ki
memset(&tmp_scm, 0, sizeof(tmp_scm));
}

+ down_read(&unix_gc_sem);
mutex_lock(&u->readlock);

do
@@ -1732,6 +1735,7 @@ static int unix_stream_recvmsg(struct ki
if (!timeo)
break;
mutex_unlock(&u->readlock);
+ up_read(&unix_gc_sem);

timeo = unix_stream_data_wait(sk, timeo);

@@ -1739,6 +1743,7 @@ static int unix_stream_recvmsg(struct ki
err = sock_intr_errno(timeo);
goto out;
}
+ down_read(&unix_gc_sem);
mutex_lock(&u->readlock);
continue;
unlock:
@@ -1810,6 +1815,7 @@ static int unix_stream_recvmsg(struct ki
} while (size);

mutex_unlock(&u->readlock);
+ up_read(&unix_gc_sem);
scm_recv(sock, msg, siocb->scm, flags);
out:
return copied ? : err;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/