Re: [PATCH] BUG: gfp_zone() not mapping zone modifiers correctlyand bad ordering of fallback lists

From: Andrew Morton
Date: Fri Jan 13 2006 - 15:17:53 EST


mel@xxxxxxxxx (Mel Gorman) wrote:
>
> Hi Andrew,
>
> This patch is divided into two parts and addresses a bug in how zone
> fallback lists are calculated and how __GFP_* zone modifiers are mapped to
> their equivilant ZONE_* type. It applies to 2.6.15-mm3 and has been tested
> on x86 and ppc64. It has been reported by Yasunori Goto that it boots on
> ia64. Details as follows;

This stuff is nasty. It'd be nice to split the patch into two (always)
(please).

> build_zonelists() attempts to be smart, and uses highest_zone() so that it
> doesn't attempt to call build_zonelists_node() for empty zones. However,
> build_zonelists_node() is smart enough to do the right thing by itself and
> build_zonelists() already has the zone index that highest_zone() is meant
> to provide. So, remove the unnecessary function highest_zone().

Dave, Andy: could you please have a think about the fallback list thing?

> The helper function gfp_zone() assumes that the bits used in the zone modifier
> of a GFP flag maps directory on to their ZONE_* equivalent and just applies a
> mask. However, the bits do not map directly and the wrong fallback lists can
> be used. If unluckly, the system can go OOM when plenty of suitable memory
> is available. This patch redefines the __GFP_ zone modifier flags to allow
> a simple mapping to their equivilant ZONE_ type.

Linus, you had a look at this a few weeks ago and I think you had
objections to the gfp-modifiers-are-bitmasks approach?

> Signed-off-by: Mel Gorman <mel@xxxxxxxxx>
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/include/linux/gfp.h linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h
> --- linux-2.6.15-mm3-clean/include/linux/gfp.h 2006-01-11 14:24:18.000000000 +0000
> +++ linux-2.6.15-mm3-001_fallbacks/include/linux/gfp.h 2006-01-13 09:18:09.000000000 +0000
> @@ -11,15 +11,19 @@ struct vm_area_struct;
> /*
> * GFP bitmasks..
> */
> -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
> -#define __GFP_DMA ((__force gfp_t)0x01u)
> -#define __GFP_HIGHMEM ((__force gfp_t)0x02u)
> +/*
> + * Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits)
> + * The values here are mapped to their equivilant ZONE_* type by gfp_zone
> + * which makes assumptions on the value of these bits.
> + */
> +#define __GFP_DMA ((__force gfp_t)0x02u)
> +#define __GFP_HIGHMEM ((__force gfp_t)0x01u)
> #ifdef CONFIG_DMA_IS_DMA32
> -#define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */
> +#define __GFP_DMA32 ((__force gfp_t)0x02u) /* ZONE_DMA is ZONE_DMA32 */
> #elif BITS_PER_LONG < 64
> -#define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */
> +#define __GFP_DMA32 ((__force gfp_t)0x00u) /* ZONE_NORMAL is ZONE_DMA32 */
> #else
> -#define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */
> +#define __GFP_DMA32 ((__force gfp_t)0x03u) /* Has own ZONE_DMA32 */
> #endif
>
> /*
> @@ -75,10 +79,15 @@ struct vm_area_struct;
> #define GFP_DMA32 __GFP_DMA32
>
>
> +/**
> + * GFP zone modifiers need to be mapped to their equivilant ZONE_ value defined
> + * in linux/mmzone.h to determine what fallback list should be used for the
> + * allocation.
> + */
> static inline int gfp_zone(gfp_t gfp)
> {
> - int zone = GFP_ZONEMASK & (__force int) gfp;
> - BUG_ON(zone >= GFP_ZONETYPES);
> + int zone = ((__force int) gfp & GFP_ZONEMASK) ^ 0x2;
> + BUG_ON(zone >= MAX_NR_ZONES);
> return zone;
> }
>
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.15-mm3-clean/mm/page_alloc.c linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c
> --- linux-2.6.15-mm3-clean/mm/page_alloc.c 2006-01-11 14:24:18.000000000 +0000
> +++ linux-2.6.15-mm3-001_fallbacks/mm/page_alloc.c 2006-01-13 09:10:17.000000000 +0000
> @@ -1577,18 +1577,6 @@ static int __init build_zonelists_node(p
> return nr_zones;
> }
>
> -static inline int highest_zone(int zone_bits)
> -{
> - int res = ZONE_NORMAL;
> - if (zone_bits & (__force int)__GFP_HIGHMEM)
> - res = ZONE_HIGHMEM;
> - if (zone_bits & (__force int)__GFP_DMA32)
> - res = ZONE_DMA32;
> - if (zone_bits & (__force int)__GFP_DMA)
> - res = ZONE_DMA;
> - return res;
> -}
> -
> #ifdef CONFIG_NUMA
> #define MAX_NODE_LOAD (num_online_nodes())
> static int __initdata node_load[MAX_NUMNODES];
> @@ -1690,13 +1678,11 @@ static void __init build_zonelists(pg_da
> node_load[node] += load;
> prev_node = node;
> load--;
> - for (i = 0; i < GFP_ZONETYPES; i++) {
> + for (i = 0; i < MAX_NR_ZONES; i++) {
> zonelist = pgdat->node_zonelists + i;
> for (j = 0; zonelist->zones[j] != NULL; j++);
>
> - k = highest_zone(i);
> -
> - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
> zonelist->zones[j] = NULL;
> }
> }
> @@ -1706,17 +1692,16 @@ static void __init build_zonelists(pg_da
>
> static void __init build_zonelists(pg_data_t *pgdat)
> {
> - int i, j, k, node, local_node;
> + int i, j, node, local_node;
>
> local_node = pgdat->node_id;
> - for (i = 0; i < GFP_ZONETYPES; i++) {
> + for (i = 0; i < MAX_NR_ZONES; i++) {
> struct zonelist *zonelist;
>
> zonelist = pgdat->node_zonelists + i;
>
> j = 0;
> - k = highest_zone(i);
> - j = build_zonelists_node(pgdat, zonelist, j, k);
> + j = build_zonelists_node(pgdat, zonelist, j, i);
> /*
> * Now we build the zonelist so that it contains the zones
> * of all the other nodes.
> @@ -1728,12 +1713,12 @@ static void __init build_zonelists(pg_da
> for (node = local_node + 1; node < MAX_NUMNODES; node++) {
> if (!node_online(node))
> continue;
> - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
> }
> for (node = 0; node < local_node; node++) {
> if (!node_online(node))
> continue;
> - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
> + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
> }
>
> zonelist->zones[j] = NULL;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/