Re: [PATCH] xen/virtio: Convert PAGE_SIZE/PAGE_SHIFT/PFN_UP to Xen counterparts

From: Oleksandr Tyshchenko
Date: Fri Oct 07 2022 - 13:35:47 EST



On 07.10.22 19:16, Oleksandr wrote:

Hello Xenia

>
> On 07.10.22 18:50, Xenia Ragiadakou wrote:
>
>
> Hello Xenia
>
>>
>> On 10/7/22 16:43, Oleksandr Tyshchenko wrote:
>>>
>>> On 07.10.22 10:15, Xenia Ragiadakou wrote:
>>>>
>>>> On 10/7/22 00:13, Oleksandr Tyshchenko wrote:
>>>>
>>>> Hi Oleksandr
>>>
>>>
>>> Hello Xenia
>>>
>>>
>>>>
>>>>>
>>>>> On 06.10.22 20:59, Xenia Ragiadakou wrote:
>>>>>
>>>>> Hello Xenia
>>>>>
>>>>>>
>>>>>> On 10/6/22 15:09, Oleksandr Tyshchenko wrote:
>>>>>>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@xxxxxxxx>
>>>>>>>
>>>>>>> Although XEN_PAGE_SIZE is equal to PAGE_SIZE (4KB) for now, it
>>>>>>> would
>>>>>>> be more correct to use Xen specific #define-s as XEN_PAGE_SIZE can
>>>>>>> be changed at some point in the future.
>>>>>>>
>>>>>>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@xxxxxxxx>
>>>>>>> ---
>>>>>>> Cc: Juergen Gross <jgross@xxxxxxxx>
>>>>>>> Cc: Xenia Ragiadakou <burzalodowa@xxxxxxxxx>
>>>>>>>
>>>>>>> As it was proposed at:
>>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/20221005174823.1800761-1-olekstysh@xxxxxxxxx/__;!!GF_29dbcQIUBPA!zHt-xZ_7tZc_EM6zva21E_YgwIiEeimFWfsJIpPwAu-TBcnzQhXHqlKzmXmwIcI6uIx_arHNZiaZeHt_428_8p-DyMpd$
>>>>>>>
>>>>>>>
>>>>>>> [lore[.]kernel[.]org]
>>>>>>>
>>>>>>> Should go in only after that series.
>>>>>>> ---
>>>>>>>     drivers/xen/grant-dma-ops.c | 20 ++++++++++----------
>>>>>>>     1 file changed, 10 insertions(+), 10 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/xen/grant-dma-ops.c
>>>>>>> b/drivers/xen/grant-dma-ops.c
>>>>>>> index c66f56d24013..5392fdc25dca 100644
>>>>>>> --- a/drivers/xen/grant-dma-ops.c
>>>>>>> +++ b/drivers/xen/grant-dma-ops.c
>>>>>>> @@ -31,12 +31,12 @@ static
>>>>>>> DEFINE_XARRAY_FLAGS(xen_grant_dma_devices,
>>>>>>> XA_FLAGS_LOCK_IRQ);
>>>>>>>       static inline dma_addr_t grant_to_dma(grant_ref_t grant)
>>>>>>>     {
>>>>>>> -    return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant <<
>>>>>>> PAGE_SHIFT);
>>>>>>> +    return XEN_GRANT_DMA_ADDR_OFF | ((dma_addr_t)grant <<
>>>>>>> XEN_PAGE_SHIFT);
>>>>>>>     }
>>>>>>
>>>>>> With this change, can the offset added to the dma handle,
>>>>>> generated by
>>>>>> grant_to_dma(), be the offset in the page? Couldn't it corrupt the
>>>>>> grant ref?
>>>>>
>>>>>
>>>>> Good point, indeed, I think it could corrupt if the guest uses a
>>>>> page granularity
>>>>> different from Xen's (i.e. 64KB).
>>>>>
>>>>>
>>>>>>
>>>>>>>       static inline grant_ref_t dma_to_grant(dma_addr_t dma)
>>>>>>>     {
>>>>>>> -    return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >>
>>>>>>> PAGE_SHIFT);
>>>>>>> +    return (grant_ref_t)((dma & ~XEN_GRANT_DMA_ADDR_OFF) >>
>>>>>>> XEN_PAGE_SHIFT);
>>>>>>>     }
>>>>>>>       static struct xen_grant_dma_data
>>>>>>> *find_xen_grant_dma_data(struct
>>>>>>> device *dev)
>>>>>>> @@ -79,7 +79,7 @@ static void *xen_grant_dma_alloc(struct device
>>>>>>> *dev, size_t size,
>>>>>>>                      unsigned long attrs)
>>>>>>>     {
>>>>>>>         struct xen_grant_dma_data *data;
>>>>>>> -    unsigned int i, n_pages = PFN_UP(size);
>>>>>>> +    unsigned int i, n_pages = XEN_PFN_UP(size);
>>>>>>>         unsigned long pfn;
>>>>>>>         grant_ref_t grant;
>>>>>>>         void *ret;
>>>>>>> @@ -91,14 +91,14 @@ static void *xen_grant_dma_alloc(struct device
>>>>>>> *dev, size_t size,
>>>>>>>         if (unlikely(data->broken))
>>>>>>>             return NULL;
>>>>>>>     -    ret = alloc_pages_exact(n_pages * PAGE_SIZE, gfp);
>>>>>>> +    ret = alloc_pages_exact(n_pages * XEN_PAGE_SIZE, gfp);
>>>>>>>         if (!ret)
>>>>>>>             return NULL;
>>>>>>>           pfn = virt_to_pfn(ret);
>>>>>>>           if (gnttab_alloc_grant_reference_seq(n_pages, &grant)) {
>>>>>>> -        free_pages_exact(ret, n_pages * PAGE_SIZE);
>>>>>>> +        free_pages_exact(ret, n_pages * XEN_PAGE_SIZE);
>>>>>>>             return NULL;
>>>>>>>         }
>>>>>>>     @@ -116,7 +116,7 @@ static void xen_grant_dma_free(struct
>>>>>>> device
>>>>>>> *dev, size_t size, void *vaddr,
>>>>>>>                        dma_addr_t dma_handle, unsigned long attrs)
>>>>>>>     {
>>>>>>>         struct xen_grant_dma_data *data;
>>>>>>> -    unsigned int i, n_pages = PFN_UP(size);
>>>>>>> +    unsigned int i, n_pages = XEN_PFN_UP(size);
>>>>>>>         grant_ref_t grant;
>>>>>>>           data = find_xen_grant_dma_data(dev);
>>>>>>> @@ -138,7 +138,7 @@ static void xen_grant_dma_free(struct device
>>>>>>> *dev, size_t size, void *vaddr,
>>>>>>>           gnttab_free_grant_reference_seq(grant, n_pages);
>>>>>>>     -    free_pages_exact(vaddr, n_pages * PAGE_SIZE);
>>>>>>> +    free_pages_exact(vaddr, n_pages * XEN_PAGE_SIZE);
>>>>>>>     }
>>>>>>>       static struct page *xen_grant_dma_alloc_pages(struct
>>>>>>> device *dev,
>>>>>>> size_t size,
>>>>>>> @@ -168,7 +168,7 @@ static dma_addr_t xen_grant_dma_map_page(struct
>>>>>>> device *dev, struct page *page,
>>>>>>>                          unsigned long attrs)
>>>>>>>     {
>>>>>>>         struct xen_grant_dma_data *data;
>>>>>>> -    unsigned int i, n_pages = PFN_UP(offset + size);
>>>>>>> +    unsigned int i, n_pages = XEN_PFN_UP(offset + size);
>>>>>>
>>>>>> The offset, here, refers to the offset in the page ...
>>>>>>
>>>>>>>         grant_ref_t grant;
>>>>>>>         dma_addr_t dma_handle;
>>>>>>>     @@ -200,8 +200,8 @@ static void xen_grant_dma_unmap_page(struct
>>>>>>> device *dev, dma_addr_t dma_handle,
>>>>>>>                          unsigned long attrs)
>>>>>>>     {
>>>>>>>         struct xen_grant_dma_data *data;
>>>>>>> -    unsigned long offset = dma_handle & (PAGE_SIZE - 1);
>>>>>>> -    unsigned int i, n_pages = PFN_UP(offset + size);
>>>>>>> +    unsigned long offset = dma_handle & ~XEN_PAGE_MASK;
>>>>>>
>>>>>> ... while, here, it refers to the offset in the grant.
>>>>>> So, the calculated number of grants may differ.
>>>>>
>>>>> Good point, I think you are right, so we need to additionally use
>>>>> xen_offset_in_page() macro in xen_grant_dma_map_page(),
>>>>>
>>>>> something like that to be squashed with current patch:
>>>>>
>>>>>
>>>>> diff --git a/drivers/xen/grant-dma-ops.c
>>>>> b/drivers/xen/grant-dma-ops.c
>>>>> index 9d5eca6d638a..bb984dc05deb 100644
>>>>> --- a/drivers/xen/grant-dma-ops.c
>>>>> +++ b/drivers/xen/grant-dma-ops.c
>>>>> @@ -169,7 +169,7 @@ static dma_addr_t xen_grant_dma_map_page(struct
>>>>> device *dev, struct page *page,
>>>>>                                             unsigned long attrs)
>>>>>     {
>>>>>            struct xen_grant_dma_data *data;
>>>>> -       unsigned int i, n_pages = XEN_PFN_UP(offset + size);
>>>>> +       unsigned int i, n_pages =
>>>>> XEN_PFN_UP(xen_offset_in_page(offset)
>>>>> + size);
>>>>>            grant_ref_t grant;
>>>>>            dma_addr_t dma_handle;
>>>>>
>>>>> @@ -191,7 +191,7 @@ static dma_addr_t xen_grant_dma_map_page(struct
>>>>> device *dev, struct page *page,
>>>>>                                    xen_page_to_gfn(page) + i, dir ==
>>>>> DMA_TO_DEVICE);
>>>>>            }
>>>>>
>>>>> -       dma_handle = grant_to_dma(grant) + offset;
>>>>> +       dma_handle = grant_to_dma(grant) +
>>>>> xen_offset_in_page(offset);
>>>>>
>>>>>            return dma_handle;
>>>>>     }
>>>>>
>>>>> Did I get your point right?
>>>>>
>>>>
>>>> I think it's more complicated than that.
>>>> Let's say that the offset in page is > XEN_PAGE_SIZE, then the
>>>> calculation of the number of grants won't take into account the part
>>>> of the offset that is multiple of the XEN_PAGE_SIZE i.e it will
>>>> calculate only the strictly necessary number of grants.
>>>> But xen_grant_dma_map_page() grants access to the whole page because,
>>>> as it can be observed in the code snippet below, it does not take into
>>>> account the page offset.
>>>>
>>>> for (i = 0; i < n_pages; i++) {
>>>>    gnttab_grant_foreign_access_ref(grant + i, data->backend_domid,
>>>> xen_page_to_gfn(page) + i, dir == DMA_TO_DEVICE);
>>>> }
>>>
>>>
>>> Thanks, valid point. Agree it's indeed more complicated. I will comment
>>> on that later. I have just pushed another fix, it is not related to
>>> XEN_PAGE_SIZE directly, but also about page offset > PAGE_SIZE
>>>
>>
>> I got a little bit confused with the order that the patches will be
>> applied :)
>
> This series should go in the first [1]:
>
> Current patch depends on it and new patch [2] also depends on it. I
> think (but might mistake) that current patch we need/want to postpone
> (because it doesn't fix/improve something immediately, but more for
> future use-cases),
>
> but the new patch is a fix for the real situation. Once we decide with
> new patch [2] we will be able to return to the current patch and
> rebase it.
>
>
> [1]
> https://lore.kernel.org/xen-devel/20221005174823.1800761-1-olekstysh@xxxxxxxxx/
>
> [2]
> https://lore.kernel.org/xen-devel/20221007132736.2275574-1-olekstysh@xxxxxxxxx/
>
>
>
>> IIUC the above scenario cannot happen, i.e the offset to be >
>> PAGE_SIZE, because this callback is used to map for transfer a
>> portion of a single page.
>
> It happens. I have rechecked that. And can provide some debug prints
> if needed.


This is the print in xen_grant_dma_map_page() which only triggers if the
passed offset > PAGE_SIZE (I applied it on top of [2])

@@ -195,6 +195,12 @@ static dma_addr_t xen_grant_dma_map_page(struct
device *dev, struct page *page,

        dma_handle = grant_to_dma(grant) + dma_offset;

+       if (offset > PAGE_SIZE) {
+               printk("%s[%d] dma_handle 0x%llx: offset 0x%lx
(dma_offset 0x%lx gfn_offset 0x%lx) size 0x%lx n_pages %d\n", __func__,
__LINE__,
+                               dma_handle, offset, dma_offset,
gfn_offset, size, n_pages);
+               WARN_ON(1);
+       }
+
        return dma_handle;
 }


At the runtime we have, for example:

[   10.277599] xen_grant_dma_map_page[199] dma_handle
0x8000000000044aa8: offset 0x3aa8 (dma_offset 0xaa8 gfn_offset 0x3) size
0x3a0 n_pages 1

[   10.277695] ------------[ cut here ]------------
[   10.277715] WARNING: CPU: 3 PID: 122 at
drivers/xen/grant-dma-ops.c:201 xen_grant_dma_map_page+0x194/0x1a0
[   10.277757] Modules linked in:
[   10.277779] CPU: 3 PID: 122 Comm: kworker/u8:6 Tainted: G        W
6.0.0-rc7-yocto-standard-00352-g0c5e442382bb-dirty #1
[   10.277823] Hardware name: XENVM-4.17 (DT)
[   10.277840] Workqueue: rpciod rpc_async_schedule
[   10.277870] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS
BTYPE=--)
[   10.277897] pc : xen_grant_dma_map_page+0x194/0x1a0
[   10.277920] lr : xen_grant_dma_map_page+0x194/0x1a0
[   10.477036] sp : ffff800009eb34a0
[   10.477068] x29: ffff800009eb34b0 x28: ffff0001c1e0c600 x27:
ffff0001c0bb3f00
[   10.477116] x26: 0000000000003aa8 x25: ffff0001c0fbab80 x24:
fffffc0000000000
[   10.477167] x23: 0000000000000003 x22: 0000000000000001 x21:
0000000000000001
[   10.477215] x20: fffffc0007048800 x19: 8000000000044aa8 x18:
0000000000000010
[   10.477265] x17: 657366666f203a38 x16: 00000000deadbeef x15:
3030303030303878
[   10.477311] x14: 00000000000005d5 x13: ffff0001c0fbafe0 x12:
00000000ffffffea
[   10.477361] x11: 00000000ffffefff x10: 00000000ffffefff x9 :
ffff80000969acc0
[   10.477407] x8 : 0000000000017fe8 x7 : c0000000ffffefff x6 :
000000000000bff4
[   10.477459] x5 : 0000000000057fa8 x4 : 0000000000000000 x3 :
ffff800009eb3238
[   10.477507] x2 : acd9abf2a61fc800 x1 : 0000000000000000 x0 :
000000000000007f
[   10.477554] Call trace:
[   10.477570]  xen_grant_dma_map_page+0x194/0x1a0
[   10.477608]  dma_map_page_attrs+0x1d4/0x230
[   10.477638]  vring_map_one_sg+0x60/0x70
[   10.477668]  virtqueue_add_outbuf+0x248/0x780
[   10.477705]  start_xmit+0x1d0/0x518
[   10.477734]  dev_hard_start_xmit+0x98/0x158
[   10.477767]  sch_direct_xmit+0xec/0x378
[   10.477795]  __dev_queue_xmit+0x5b8/0xc50
[   10.477820]  ip_finish_output2+0x234/0x560
[   10.477853]  __ip_finish_output+0xac/0x268
[   10.477878]  ip_output+0xfc/0x1b0
[   10.477905]  ip_local_out+0x48/0x60
[   10.477934]  __ip_queue_xmit+0x140/0x3c8
[   10.477960]  ip_queue_xmit+0x14/0x20
[   10.477986]  __tcp_transmit_skb+0x480/0xad0
[   10.478014]  tcp_write_xmit+0x5dc/0x1048
[   10.478045]  __tcp_push_pending_frames+0x3c/0xc8
[   10.478077]  __tcp_sock_set_cork.part.0+0x60/0x70
[   10.478113]  tcp_sock_set_cork+0x50/0x68
[   10.478141]  xs_tcp_send_request+0x1d0/0x248
[   10.478174]  xprt_transmit+0x154/0x298
[   10.478206]  call_transmit+0x98/0xb0
[   10.478232]  __rpc_execute+0xb0/0x338
[   10.478258]  rpc_async_schedule+0x2c/0x50
[   10.478286]  process_one_work+0x1d0/0x320
[   10.478314]  worker_thread+0x4c/0x400
[   10.478341]  kthread+0x110/0x120
[   10.478371]  ret_from_fork+0x10/0x20
[   10.478399] ---[ end trace 0000000000000000 ]---


We get an offset of 0x3aa8. We are in the process of mapping a virtio
descriptor which is passed from the top level as a scatterlist, according
to the dump

the dma_map_page() is called from here with scatterlist parameters:

https://elixir.bootlin.com/linux/v6.0-rc7/source/drivers/virtio/virtio_ring.c#L363

So we are not dealing with a range within a single page.


Or I really missed something.


>
>
>
>>
>>
>>> so touches the same code and should be prereq:
>>>
>>> https://urldefense.com/v3/__https://lore.kernel.org/all/20221007132736.2275574-1-olekstysh@xxxxxxxxx/__;!!GF_29dbcQIUBPA!36CJ5S5T47HMh9Nq-0WVUPSlnHjvlUon-ooFGowbvGA8BdWSCC9niF0f_btvite1G6LIRwyw6XsU6PAnCjMy92KiNkMG$
>>> [lore[.]kernel[.]org]
>>>
>>>
>>>>
>>>>>>
>>>>>>
>>>>>>> +    unsigned int i, n_pages = XEN_PFN_UP(offset + size);
>>>>>>>         grant_ref_t grant;
>>>>>>>           if (WARN_ON(dir == DMA_NONE))
>>>>>>
>>>>>
>>>>> Thank you.
>>>>>
>>>>>
>>>>
>>
--
Regards,

Oleksandr Tyshchenko