Re: [PATCH v9 1/8] mm/demotion: Add support for explicit memory tiers

From: Aneesh Kumar K.V
Date: Fri Jul 15 2022 - 06:27:48 EST


Aneesh Kumar K V <aneesh.kumar@xxxxxxxxxxxxx> writes:

....

>
>> You dropped the original sysfs interface patches from the series, but
>> the kernel internal implementation is still for the original sysfs
>> interface. For example, memory tier ID is for the original sysfs
>> interface, not for the new proposed sysfs interface. So I suggest you
>> to implement with the new interface in mind. What do you think about
>> the following design?
>>
>
> Sorry I am not able to follow you here. This patchset completely drops
> exposing memory tiers to userspace via sysfs. Instead it allow
> creation of memory tiers with specific tierID from within the kernel/device driver.
> Default tierID is 200 and dax kmem creates memory tier with tierID 100.
>
>
>> - Each NUMA node belongs to a memory type, and each memory type
>> corresponds to a "abstract distance", so each NUMA node corresonds to
>> a "distance". For simplicity, we can start with static distances, for
>> example, DRAM (default): 150, PMEM: 250. The distance of each NUMA
>> node can be recorded in a global array,
>>
>> int node_distances[MAX_NUMNODES];
>>
>> or, just
>>
>> pgdat->distance
>>
>
> I don't follow this. I guess you are trying to have a different design.
> Would it be much easier if you can write this in the form of a patch?
>
>
>> - Each memory tier corresponds to a range of distance, for example,
>> 0-100, 100-200, 200-300, >300, we can start with static ranges too.
>>
>> - The core API of memory tier could be
>>
>> struct memory_tier *find_create_memory_tier(int distance);
>>
>> it will find the memory tier which covers "distance" in the memory
>> tier list, or create a new memory tier if not found.
>>
>
> I was expecting this to be internal to dax kmem. How dax kmem maps
> "abstract distance" to a memory tier. At this point this patchset is
> keeping all that for a future patchset.
>

This shows how i was expecting "abstract distance" to be integrated.

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 82cae08976bc..1281aec63986 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -1332,6 +1332,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.mapping = &mapping;
ndr_desc.num_mappings = 1;
ndr_desc.nd_set = &p->nd_set;
+ ndr_desc.memtier_distance = PMEM_MEMTIER_DEFAULT_DISTANCE;

if (p->hcall_flush_required) {
set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index ae5f4acf2675..7b8cf1f15562 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2641,6 +2641,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
NUMA_NO_NODE, ndr_desc->numa_node, &res.start, &res.end);
}

+ /*
+ * We may want to look at SLIT/HMAT to fine tune this
+ */
+ ndr_desc->memtier_distance = PMEM_MEMTIER_DEFAULT_DISTANCE;
/*
* Persistence domain bits are hierarchical, if
* ACPI_NFIT_CAPABILITY_CACHE_FLUSH is set then
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 1dad813ee4a6..708a40cf29c0 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -570,8 +570,9 @@ static void dax_region_unregister(void *region)
}

struct dax_region *alloc_dax_region(struct device *parent, int region_id,
- struct range *range, int target_node, unsigned int align,
- unsigned long flags)
+ struct range *range, int target_node,
+ int memtier_distance, unsigned int align,
+ unsigned long flags)
{
struct dax_region *dax_region;

@@ -599,6 +600,7 @@ struct dax_region *alloc_dax_region(struct device *parent, int region_id,
dax_region->align = align;
dax_region->dev = parent;
dax_region->target_node = target_node;
+ dax_region->memtier_distance = memtier_distance;
ida_init(&dax_region->ida);
dax_region->res = (struct resource) {
.start = range->start,
@@ -1370,6 +1372,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)

dev_dax->dax_dev = dax_dev;
dev_dax->target_node = dax_region->target_node;
+ dev_dax->memtier_distance = dax_region->memtier_distance;
dev_dax->align = dax_region->align;
ida_init(&dev_dax->ida);
kref_get(&dax_region->kref);
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index fbb940293d6d..3de4292392dd 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -13,8 +13,9 @@ void dax_region_put(struct dax_region *dax_region);

#define IORESOURCE_DAX_STATIC (1UL << 0)
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
- struct range *range, int target_node, unsigned int align,
- unsigned long flags);
+ struct range *range, int target_node,
+ int memtier_distance, unsigned int align,
+ unsigned long flags);

struct dev_dax_data {
struct dax_region *dax_region;
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 1c974b7caae6..5db382c78d0e 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -31,6 +31,7 @@ void dax_bus_exit(void);
struct dax_region {
int id;
int target_node;
+ int memtier_distance;
struct kref kref;
struct device *dev;
unsigned int align;
@@ -64,6 +65,7 @@ struct dev_dax {
struct dax_device *dax_dev;
unsigned int align;
int target_node;
+ int memtier_distance;
int id;
struct ida ida;
struct device dev;
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 1bf040dbc834..b9f80971c07b 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -26,7 +26,7 @@ static int dax_hmem_probe(struct platform_device *pdev)
range.start = res->start;
range.end = res->end;
dax_region = alloc_dax_region(dev, pdev->id, &range, mri->target_node,
- PMD_SIZE, 0);
+ mri->memtier_distance, PMD_SIZE, 0);
if (!dax_region)
return -ENOMEM;

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 0c03889286ac..32878bd96f09 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -45,13 +45,18 @@ struct dax_kmem_data {
static unsigned int dax_kmem_memtier = MEMORY_TIER_PMEM;
module_param(dax_kmem_memtier, uint, 0644);

+int find_memtier_from_distance(struct dev_dax *dev_dax)
+{
+ return dax_kmem_memtier + dev_dax->memtier_distance;
+}
+
static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
struct device *dev = &dev_dax->dev;
unsigned long total_len = 0;
struct dax_kmem_data *data;
int i, rc, mapped = 0;
- int numa_node;
+ int numa_node, mem_tier;

/*
* Ensure good NUMA information for the persistent memory.
@@ -150,7 +155,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
}

dev_set_drvdata(dev, data);
- node_create_and_set_memory_tier(numa_node, dax_kmem_memtier);
+ mem_tier = find_memtier_from_distance(dev_dax);
+ node_create_and_set_memory_tier(numa_node, mem_tier);
return 0;

err_request_mem:
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index f050ea78bb83..1b51fc0490de 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -54,8 +54,10 @@ static struct dev_dax *__dax_pmem_probe(struct device *dev)
range = pgmap.range;
range.start += offset;
dax_region = alloc_dax_region(dev, region_id, &range,
- nd_region->target_node, le32_to_cpu(pfn_sb->align),
- IORESOURCE_DAX_STATIC);
+ nd_region->target_node,
+ nd_region->memtier_distance,
+ le32_to_cpu(pfn_sb->align),
+ IORESOURCE_DAX_STATIC);
if (!dax_region)
return ERR_PTR(-ENOMEM);

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index ec5219680092..cf7a379a2220 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -416,6 +416,7 @@ struct nd_region {
u64 ndr_size;
u64 ndr_start;
int id, num_lanes, ro, numa_node, target_node;
+ int memtier_distance;
void *provider_data;
struct kernfs_node *bb_state;
struct badblocks bb;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index d976260eca7a..f2067de8d660 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1019,6 +1019,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
nd_region->ro = ro;
nd_region->numa_node = ndr_desc->numa_node;
nd_region->target_node = ndr_desc->target_node;
+ nd_region->memtier_distance = ndr_desc->memtier_distance;
ida_init(&nd_region->ns_ida);
ida_init(&nd_region->btt_ida);
ida_init(&nd_region->pfn_ida);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0d61e07b6827..bf20e018074f 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -121,6 +121,7 @@ struct nd_region_desc {
int num_lanes;
int numa_node;
int target_node;
+ int memtier_distance;
unsigned long flags;
struct device_node *of_node;
int (*flush)(struct nd_region *nd_region, struct bio *bio);
@@ -224,6 +225,8 @@ struct nvdimm_fw_ops {
int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg);
};

+#define PMEM_MEMTIER_DEFAULT_DISTANCE 10
+
void badrange_init(struct badrange *badrange);
int badrange_add(struct badrange *badrange, u64 addr, u64 length);
void badrange_forget(struct badrange *badrange, phys_addr_t start,
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index c04c4fd2e209..5850e2bbbfed 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -6,6 +6,7 @@

struct memregion_info {
int target_node;
+ int memtier_distance;
};

#ifdef CONFIG_MEMREGION