Re: [PATCH] vmalloc: respect the GFP_NOIO and GFP_NOFS flags

From: Mikulas Patocka
Date: Fri Jun 30 2017 - 14:12:19 EST




On Fri, 30 Jun 2017, Michal Hocko wrote:

> On Thu 29-06-17 22:25:09, Mikulas Patocka wrote:
> > The __vmalloc function has a parameter gfp_mask with the allocation flags,
> > however it doesn't fully respect the GFP_NOIO and GFP_NOFS flags. The
> > pages are allocated with the specified gfp flags, but the pagetables are
> > always allocated with GFP_KERNEL. This allocation can cause unexpected
> > recursion into the filesystem or I/O subsystem.
> >
> > It is not practical to extend page table allocation routines with gfp
> > flags because it would require modification of architecture-specific code
> > in all architecturs. However, the process can temporarily request that all
> > allocations are done with GFP_NOFS or GFP_NOIO with with the functions
> > memalloc_nofs_save and memalloc_noio_save.
> >
> > This patch makes the vmalloc code use memalloc_nofs_save or
> > memalloc_noio_save if the supplied gfp flags do not contain __GFP_FS or
> > __GFP_IO. It fixes some possible deadlocks in drivers/mtd/ubi/io.c,
> > fs/gfs2/, fs/btrfs/free-space-tree.c, fs/ubifs/,
> > fs/nfs/blocklayout/extent_tree.c where __vmalloc is used with the GFP_NOFS
> > flag.
>
> I strongly believe this is a step in the _wrong_ direction. Why? Because

What do you think __vmalloc with GFP_NOIO should do? Print a warning?
Silently ignore the GFP_NOIO flag?

Mikulas

> the memalloc_no{io,fs}_save API is for the scope allocation context. We
> want users of the scope to define it and document why it is needed.
> GFP_NOFS (I haven't checked GFP_NOIO users) is overused a _lot_ mostly
> based on the filesystem should rather use it to prevent deadlock cargo
> cult. This should change longterm because heavy fs workloads can cause
> troubles to the memory reclaim. So we really want to encourage those
> users to define nofs scopes (e.g. on journal locked contexts etc.)
> rather than have them use the GFP_NOFS explicitly and very often
> mindlessly.
>
> I am not going to nack this patch because it not incorrect but I would
> really like to discourage you from it because while it saves 24 lines of
> code it (ab)uses the scope allocation context at a wrong layer.
>
> > The patch also simplifies code in dm-bufio.c, dm-ioctl.c and fs/xfs/kmem.c
> > by removing explicit calls to memalloc_nofs_save and memalloc_noio_save
> > before the call to __vmalloc.
> >
> > Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>
> >
> > ---
> > drivers/md/dm-bufio.c | 24 +-----------------------
> > drivers/md/dm-ioctl.c | 6 +-----
> > fs/xfs/kmem.c | 14 --------------
> > mm/util.c | 6 +++---
> > mm/vmalloc.c | 18 +++++++++++++++++-
> > 5 files changed, 22 insertions(+), 46 deletions(-)
> >
> > Index: linux-2.6/mm/vmalloc.c
> > ===================================================================
> > --- linux-2.6.orig/mm/vmalloc.c
> > +++ linux-2.6/mm/vmalloc.c
> > @@ -31,6 +31,7 @@
> > #include <linux/compiler.h>
> > #include <linux/llist.h>
> > #include <linux/bitops.h>
> > +#include <linux/sched/mm.h>
> >
> > #include <linux/uaccess.h>
> > #include <asm/tlbflush.h>
> > @@ -1670,6 +1671,8 @@ static void *__vmalloc_area_node(struct
> > unsigned int nr_pages, array_size, i;
> > const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
> > const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
> > + unsigned noio_flag;
> > + int r;
> >
> > nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> > array_size = (nr_pages * sizeof(struct page *));
> > @@ -1712,8 +1715,21 @@ static void *__vmalloc_area_node(struct
> > cond_resched();
> > }
> >
> > - if (map_vm_area(area, prot, pages))
> > + if (unlikely(!(gfp_mask & __GFP_IO)))
> > + noio_flag = memalloc_noio_save();
> > + else if (unlikely(!(gfp_mask & __GFP_FS)))
> > + noio_flag = memalloc_nofs_save();
> > +
> > + r = map_vm_area(area, prot, pages);
> > +
> > + if (unlikely(!(gfp_mask & __GFP_IO)))
> > + memalloc_noio_restore(noio_flag);
> > + else if (unlikely(!(gfp_mask & __GFP_FS)))
> > + memalloc_nofs_restore(noio_flag);
> > +
> > + if (unlikely(r))
> > goto fail;
> > +
> > return area->addr;
> >
> > fail:
> > Index: linux-2.6/mm/util.c
> > ===================================================================
> > --- linux-2.6.orig/mm/util.c
> > +++ linux-2.6/mm/util.c
> > @@ -351,10 +351,10 @@ void *kvmalloc_node(size_t size, gfp_t f
> > void *ret;
> >
> > /*
> > - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
> > - * so the given set of flags has to be compatible.
> > + * vmalloc uses blocking allocations for some internal allocations
> > + * (e.g page tables) so the given set of flags has to be compatible.
> > */
> > - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
> > + WARN_ON_ONCE(!gfpflags_allow_blocking(flags));
> >
> > /*
> > * We want to attempt a large physically contiguous block first because
> > Index: linux-2.6/drivers/md/dm-bufio.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm-bufio.c
> > +++ linux-2.6/drivers/md/dm-bufio.c
> > @@ -386,9 +386,6 @@ static void __cache_size_refresh(void)
> > static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
> > enum data_mode *data_mode)
> > {
> > - unsigned noio_flag;
> > - void *ptr;
> > -
> > if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
> > *data_mode = DATA_MODE_SLAB;
> > return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
> > @@ -402,26 +399,7 @@ static void *alloc_buffer_data(struct dm
> > }
> >
> > *data_mode = DATA_MODE_VMALLOC;
> > -
> > - /*
> > - * __vmalloc allocates the data pages and auxiliary structures with
> > - * gfp_flags that were specified, but pagetables are always allocated
> > - * with GFP_KERNEL, no matter what was specified as gfp_mask.
> > - *
> > - * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
> > - * all allocations done by this process (including pagetables) are done
> > - * as if GFP_NOIO was specified.
> > - */
> > -
> > - if (gfp_mask & __GFP_NORETRY)
> > - noio_flag = memalloc_noio_save();
> > -
> > - ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> > -
> > - if (gfp_mask & __GFP_NORETRY)
> > - memalloc_noio_restore(noio_flag);
> > -
> > - return ptr;
> > + return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
> > }
> >
> > /*
> > Index: linux-2.6/drivers/md/dm-ioctl.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/md/dm-ioctl.c
> > +++ linux-2.6/drivers/md/dm-ioctl.c
> > @@ -1691,7 +1691,6 @@ static int copy_params(struct dm_ioctl _
> > struct dm_ioctl *dmi;
> > int secure_data;
> > const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
> > - unsigned noio_flag;
> >
> > if (copy_from_user(param_kernel, user, minimum_data_size))
> > return -EFAULT;
> > @@ -1714,10 +1713,7 @@ static int copy_params(struct dm_ioctl _
> > * suspended and the ioctl is needed to resume it.
> > * Use kmalloc() rather than vmalloc() when we can.
> > */
> > - dmi = NULL;
> > - noio_flag = memalloc_noio_save();
> > - dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
> > - memalloc_noio_restore(noio_flag);
> > + dmi = kvmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH);
> >
> > if (!dmi) {
> > if (secure_data && clear_user(user, param_kernel->data_size))
> > Index: linux-2.6/fs/xfs/kmem.c
> > ===================================================================
> > --- linux-2.6.orig/fs/xfs/kmem.c
> > +++ linux-2.6/fs/xfs/kmem.c
> > @@ -48,7 +48,6 @@ kmem_alloc(size_t size, xfs_km_flags_t f
> > void *
> > kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
> > {
> > - unsigned nofs_flag = 0;
> > void *ptr;
> > gfp_t lflags;
> >
> > @@ -56,22 +55,9 @@ kmem_zalloc_large(size_t size, xfs_km_fl
> > if (ptr)
> > return ptr;
> >
> > - /*
> > - * __vmalloc() will allocate data pages and auxillary structures (e.g.
> > - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
> > - * here. Hence we need to tell memory reclaim that we are in such a
> > - * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
> > - * the filesystem here and potentially deadlocking.
> > - */
> > - if (flags & KM_NOFS)
> > - nofs_flag = memalloc_nofs_save();
> > -
> > lflags = kmem_flags_convert(flags);
> > ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
> >
> > - if (flags & KM_NOFS)
> > - memalloc_nofs_restore(nofs_flag);
> > -
> > return ptr;
> > }
> >
>
> --
> Michal Hocko
> SUSE Labs
>