Re: [PATCH net-next V3] tun: add eBPF based queue selection method

From: Willem de Bruijn
Date: Tue Dec 05 2017 - 11:14:35 EST


On Tue, Dec 5, 2017 at 2:29 AM, Jason Wang <jasowang@xxxxxxxxxx> wrote:
>
>
> On 2017å12æ05æ 08:16, Willem de Bruijn wrote:
>>
>> On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasowang@xxxxxxxxxx> wrote:
>>>
>>> This patch introduces an eBPF based queue selection method. With this,
>>> the policy could be offloaded to userspace completely through a new
>>> ioctl TUNSETSTEERINGEBPF.
>>>
>>> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
>>> ---
>>> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff
>>> *skb)
>>> +{
>>> + struct tun_steering_prog *prog;
>>> + u16 ret = 0;
>>> +
>>> + prog = rcu_dereference(tun->steering_prog);
>>> + if (prog)
>>> + ret = bpf_prog_run_clear_cb(prog->prog, skb);
>>
>> This dereferences tun->steering_prog for a second time. It is safe
>> in this load balancing case to assign a few extra packets to queue 0.
>> But the issue can also be avoided by replacing the function with a
>> direct call in tun_net_xmit:
>>
>> struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
>> if (s)
>> ret = bpf_prog_run_clear_cb(s->prog, skb) %
>> tun->numqueues;
>
>
> Right.
>
>
>>
>>> /* Net device start xmit */
>>> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device
>>> *dev)
>>> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>>> {
>>> - struct tun_struct *tun = netdev_priv(dev);
>>> - int txq = skb->queue_mapping;
>>> - struct tun_file *tfile;
>>> - u32 numqueues = 0;
>>> -
>>> - rcu_read_lock();
>>> - tfile = rcu_dereference(tun->tfiles[txq]);
>>> - numqueues = READ_ONCE(tun->numqueues);
>>> -
>>> - /* Drop packet if interface is not attached */
>>> - if (txq >= numqueues)
>>> - goto drop;
>>> -
>>> #ifdef CONFIG_RPS
>>> - if (numqueues == 1 && static_key_false(&rps_needed)) {
>>> + if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>>> /* Select queue was not called for the skbuff, so we
>>> extract the
>>> * RPS hash and save it into the flow_table here.
>>> */
>>> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb,
>>> struct net_device *dev)
>>> }
>>> }
>>> #endif
>>> +}
>>> +
>>> +/* Net device start xmit */
>>> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device
>>> *dev)
>>> +{
>>> + struct tun_struct *tun = netdev_priv(dev);
>>> + int txq = skb->queue_mapping;
>>> + struct tun_file *tfile;
>>> + u32 numqueues = 0;
>>> +
>>> + rcu_read_lock();
>>> + tfile = rcu_dereference(tun->tfiles[txq]);
>>> + numqueues = READ_ONCE(tun->numqueues);
>>
>> Now tun->numqueues is read twice, reversing commit fa35864e0bb7
>> ("tuntap: Fix for a race in accessing numqueues"). I don't see anything
>> left that would cause a divide by zero after the relevant code was
>> converted from divide to multiple and subsequently even removed.
>>
>> But if it's safe to read multiple times, might as well remove the
>> READ_ONCE.
>
>
> Good point, but READ_ONCE() is not something new, we'd better change this in
> another patch.

Sounds good. It's a simple follow-up. I can also send that.
>
>
>>
>>> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>> struct tun_file *tfile,
>>> int copylen;
>>> bool zerocopy = false;
>>> int err;
>>> - u32 rxhash;
>>> + u32 rxhash = 0;
>>> int skb_xdp = 1;
>>> bool frags = tun_napi_frags_enabled(tun);
>>>
>>> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct
>>> *tun, struct tun_file *tfile,
>>> rcu_read_unlock();
>>> }
>>>
>>> - rxhash = __skb_get_hash_symmetric(skb);
>>> + rcu_read_lock();
>>> + if (!rcu_dereference(tun->steering_prog))
>>> + rxhash = __skb_get_hash_symmetric(skb);
>>> + rcu_read_unlock();
>>>
>>> if (frags) {
>>> /* Exercise flow dissector code path. */
>>> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>> struct tun_file *tfile,
>>> u64_stats_update_end(&stats->syncp);
>>> put_cpu_ptr(stats);
>>>
>>> - tun_flow_update(tun, rxhash, tfile);
>>> + if (rxhash)
>>> + tun_flow_update(tun, rxhash, tfile);
>>> +
>>
>> Nit: zero is a valid hash? In which case, an int64_t initialized to -1 is
>> the
>> safer check.
>
>
> Looks not? E.g looking at __flow_hash_from_keys() it did:
>
> static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
> {
> u32 hash;
>
> __flow_hash_consistentify(keys);
>
> hash = __flow_hash_words(flow_keys_hash_start(keys),
> flow_keys_hash_length(keys), keyval);
> if (!hash)
> hash = 1;
>
> return hash;
> }
>
> Thanks

Interesting, thanks. In that case

Acked-by: Willem de Bruijn <willemb@xxxxxxxxxx>