Re: KASAN: slab-use-after-free Read in ip_finish_output

From: Florian Westphal
Date: Thu Mar 14 2024 - 20:08:25 EST


Florian Westphal <fw@xxxxxxxxx> wrote:
> Eric Dumazet <edumazet@xxxxxxxxxx> wrote:
> > Thanks for taking a look Florian.
> >
> > Perhaps not messing with truesize at all would help ?
> >
> > Something based on this POC :
> >
> > spin_lock(&qp->q.lock);
> > + if (!qp->q.sk) {
> > + struct sock *sk = skb->sk;
> >
> > + if (sk && refcount_inc_not_zero(&sk->sk_refcnt))
> > + qp->q.sk = sk;
>
> Unfortunetely I did not get this to work.
>
> sk_refcnt is 0. sk is kept alive by sock_wfree destructor.
>
> I don't know how to recover from this, refcnt cannot be "repaired"
> anymore.
>
> I could artificially inflate sk_wmem counter by 1, to prevent release,
> but that needs yet another sock_wfree-like destructor.
>
> I'm already not sure how this existing scheme works, there are
> multiple places that check for skb->destructor == sock_wfree,
> yet we have is_skb_wmem helper that checks __sock_wfree and tcp_wfree.
>
> Removing defrag from output seems like best option, but it will
> surely break some scenarios.
>
> Or, I could just fail reasm if sk refcount is already 0, that would
> likely work too?

I am going to test (and formally submit if no issues found):

inet: inet_defrag: prevent sk release while still in use

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3023bc2be6a1..b1aeed510f24 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -753,8 +753,6 @@ typedef unsigned char *sk_buff_data_t;
* @list: queue head
* @ll_node: anchor in an llist (eg socket defer_list)
* @sk: Socket we are owned by
- * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
- * fragmentation management
* @dev: Device we arrived on/are leaving by
* @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
* @cb: Control buffer. Free for use by every layer. Put private vars here
@@ -875,10 +873,7 @@ struct sk_buff {
struct llist_node ll_node;
};

- union {
- struct sock *sk;
- int ip_defrag_offset;
- };
+ struct sock *sk;

union {
ktime_t tstamp;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 7072fc0783ef..fab1ea5df089 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -39,6 +39,7 @@ struct ipfrag_skb_cb {
};
struct sk_buff *next_frag;
int frag_run_len;
+ int ip_defrag_offset;
};

#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
@@ -396,12 +397,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
*/
if (!last)
fragrun_create(q, skb); /* First fragment. */
- else if (last->ip_defrag_offset + last->len < end) {
+ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
- if (offset < last->ip_defrag_offset + last->len)
+ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
return IPFRAG_OVERLAP;
- if (offset == last->ip_defrag_offset + last->len)
+ if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
fragrun_append_to_last(q, skb);
else
fragrun_create(q, skb);
@@ -418,13 +419,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,

parent = *rbn;
curr = rb_to_skb(parent);
- curr_run_end = curr->ip_defrag_offset +
+ curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
FRAG_CB(curr)->frag_run_len;
- if (end <= curr->ip_defrag_offset)
+ if (end <= FRAG_CB(curr)->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= curr_run_end)
rbn = &parent->rb_right;
- else if (offset >= curr->ip_defrag_offset &&
+ else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
end <= curr_run_end)
return IPFRAG_DUP;
else
@@ -438,7 +439,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
rb_insert_color(&skb->rbnode, &q->rb_fragments);
}

- skb->ip_defrag_offset = offset;
+ FRAG_CB(skb)->ip_defrag_offset = offset;

return IPFRAG_OK;
}
@@ -448,13 +449,29 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
struct sk_buff *parent)
{
struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
- struct sk_buff **nextp;
+ struct sk_buff **nextp = NULL;
+ struct sock *sk = skb->sk;
int delta;

+ if (sk) {
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+
+ /* need to orphan now, both this function and
+ * inet_frag_reasm_finish alter skb->truesize, which
+ * messes up sk->sk_wmem accounting.
+ * skb->sk might have been passed as argument to dst->output
+ * and must remain valid until tx completes.
+ */
+ skb_orphan(skb);
+ }
+
if (head != skb) {
fp = skb_clone(skb, GFP_ATOMIC);
- if (!fp)
- return NULL;
+ if (!fp) {
+ head = skb;
+ goto out_restore_sk;
+ }
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
if (RB_EMPTY_NODE(&skb->rbnode))
FRAG_CB(parent)->next_frag = fp;
@@ -470,13 +487,14 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
consume_skb(head);
head = skb;
}
- WARN_ON(head->ip_defrag_offset != 0);
+ WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
+ DEBUG_NET_WARN_ON_ONCE(head->sk);

delta = -head->truesize;

/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
- return NULL;
+ goto out_restore_sk;

delta += head->truesize;
if (delta)
@@ -492,7 +510,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,

clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
- return NULL;
+ goto out_restore_sk;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
@@ -509,6 +527,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
nextp = &skb_shinfo(head)->frag_list;
}

+out_restore_sk:
+ if (sk) {
+ head->sk = sk;
+ head->destructor = sock_edemux;
+ }
+
return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a4941f53b523..fb947d1613fe 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -384,6 +384,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
}

skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;

insert_error:
@@ -487,7 +488,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
struct ipq *qp;

__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
- skb_orphan(skb);

/* Lookup (or create) queue header */
qp = ip_find(net, ip_hdr(skb), user, vif);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 1a51a44571c3..d0dcbaca1994 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -294,6 +294,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
}

skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;

insert_error:
@@ -469,7 +470,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);

- skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) {