Re: [PATCH v13 4/9] mm/demotion/dax/kmem: Set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE

From: Aneesh Kumar K V
Date: Tue Aug 09 2022 - 01:33:58 EST


On 8/9/22 8:34 AM, Huang, Ying wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes:
>
>> By default, all nodes are assigned to the default memory tier which
>> is the memory tier designated for nodes with DRAM
>>
>> Set dax kmem device node's tier to slower memory tier by assigning
>> abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE. Low-level drivers
>> like papr_scm or ACPI NFIT can initialize memory device type to a
>> more accurate value based on device tree details or HMAT.
>
> I don't know how ACPI NFIT can help here. Can you teach me?
>
> Per my understanding, we may use the information provided by ACPI SLIT
> or HMAT (or device tree via papr_scm) to create memory types. Before
> that is implemented, we just create a memory type with default abstract
> distance.
>

My idea is to use ACPI NFIT driver that creates a persistent memory region
(nvdimm_region_create) to also create memory type and assign that to the
NUMA node mapping that region. For now NFIT driver manages all the persistent
memory region/DIMM creation (drivers/acpi/nfit/core.c). It can also do the
memory type creation using other ACPI information like SLIT/HMAT etc. Similarly
CXL driver can do the same using CDAT.



>> If the
>> kernel doesn't find the memory type initialized, a default slower
>> memory type is assigned by the kmem driver.
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
>> ---
>> drivers/dax/kmem.c | 40 ++++++++++++++++++-
>> include/linux/memory-tiers.h | 26 ++++++++++++-
>> mm/memory-tiers.c | 74 +++++++++++++++++++++++++-----------
>> 3 files changed, 115 insertions(+), 25 deletions(-)
>>
>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>> index a37622060fff..b5cb03307af8 100644
>> --- a/drivers/dax/kmem.c
>> +++ b/drivers/dax/kmem.c
>> @@ -11,9 +11,17 @@
>> #include <linux/fs.h>
>> #include <linux/mm.h>
>> #include <linux/mman.h>
>> +#include <linux/memory-tiers.h>
>> #include "dax-private.h"
>> #include "bus.h"
>>
>> +/*
>> + * Default abstract distance assigned to the NUMA node onlined
>> + * by DAX/kmem if the low level platform driver didn't initialize
>> + * one for this NUMA node.
>> + */
>
> We have 2 choices here.
>
> 1. The low level drivers create memory types and set
> node_memory_types[]. We need a mechanism to coordinate among multiple
> drivers. On x86, we have ACPI SLIT, HMAT. And we have platform
> independent CXL defined CDAT.
>
> 2. The high level driver (such as dax/kmem.c) coordinate among multiple
> low level drivers. For example, it may query CXL CDAT firstly, and use
> a notifier chain to query platform drivers with priority.
>
> Personally, I prefer choice 2. We can discuss this later. But can we
> make the comments more general to avoid to make decision now?
>
>> +#define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 2)
>> +
>> /* Memory resource name used for add_memory_driver_managed(). */
>> static const char *kmem_name;
>> /* Set if any memory will remain added when the driver will be unloaded. */
>> @@ -41,6 +49,7 @@ struct dax_kmem_data {
>> struct resource *res[];
>> };
>>
>> +static struct memory_dev_type *dax_slowmem_type;
>
> I don't think "slowmem" make much sense here. There may be even slower
> memory. Just dax_mem_type or dax_kmem_type should be OK?
>

Dan Williams had a question about why we are adding an abstract distance slower than DRAM in
dax/kmem where we can also initialize faster than DRAM memory using kmem. The
"slowmem" was added to indicate that this type is slower than DRAM.

>> static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>> {
>> struct device *dev = &dev_dax->dev;
>> @@ -62,6 +71,8 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>> return -EINVAL;
>> }
>>
>> + init_node_memory_type(numa_node, dax_slowmem_type);
>> +
>
> I don't find error handling in this function. Per my understanding, if
> memory hot-add fails, we need to call clear_node_memory_type().
>

Added

>> for (i = 0; i < dev_dax->nr_range; i++) {
>> struct range range;
>>
>> @@ -162,6 +173,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>> static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
>> {
>> int i, success = 0;
>> + int node = dev_dax->target_node;
>> struct device *dev = &dev_dax->dev;
>> struct dax_kmem_data *data = dev_get_drvdata(dev);
>>
>> @@ -198,6 +210,14 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
>> kfree(data->res_name);
>> kfree(data);
>> dev_set_drvdata(dev, NULL);
>> + /*
>> + * Clear the memtype association on successful unplug.
>> + * If not, we have memory blocks left which can be
>> + * offlined/onlined later. We need to keep memory_dev_type
>> + * for that. This implies this reference will be around
>> + * till next reboot.
>> + */
>> + clear_node_memory_type(node, dax_slowmem_type);
>> }
>> }
>> #else
>> @@ -228,9 +248,27 @@ static int __init dax_kmem_init(void)
>> if (!kmem_name)
>> return -ENOMEM;
>>
>> + dax_slowmem_type = kmalloc(sizeof(*dax_slowmem_type), GFP_KERNEL);
>> + if (!dax_slowmem_type) {
>> + rc = -ENOMEM;
>> + goto kmem_name_free;
>> + }
>> + dax_slowmem_type->adistance = MEMTIER_DEFAULT_DAX_ADISTANCE;
>> + INIT_LIST_HEAD(&dax_slowmem_type->tier_sibiling);
>> + dax_slowmem_type->nodes = NODE_MASK_NONE;
>> + dax_slowmem_type->memtier = NULL;
>> + kref_init(&dax_slowmem_type->kref);
>> +
>
> Here we initialize the kref to 1. So in dax_kmem_exit() we should drop
> the last reference, otherwise we cannot free the memory type?
>

Add the required destroy to dax_kmem_exit()

>> rc = dax_driver_register(&device_dax_kmem_driver);
>> if (rc)
>> - kfree_const(kmem_name);
>> + goto error_out;
>> +
>> + return rc;
>> +
>> +error_out:
>> + kfree(dax_slowmem_type);
>> +kmem_name_free:
>> + kfree_const(kmem_name);
>> return rc;
>> }
>>
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index cc89876899a6..7bf6f47d581a 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -2,6 +2,8 @@
>> #ifndef _LINUX_MEMORY_TIERS_H
>> #define _LINUX_MEMORY_TIERS_H
>>
>> +#include <linux/types.h>
>> +#include <linux/nodemask.h>
>> /*
>> * Each tier cover a abstrace distance chunk size of 128
>> */
>> @@ -13,12 +15,34 @@
>> #define MEMTIER_ADISTANCE_DRAM (4 * MEMTIER_CHUNK_SIZE)
>> #define MEMTIER_HOTPLUG_PRIO 100
>>
>> +struct memory_tier;
>> +struct memory_dev_type {
>> + /* list of memory types that are part of same tier as this type */
>> + struct list_head tier_sibiling;
>> + /* abstract distance for this specific memory type */
>> + int adistance;
>> + /* Nodes of same abstract distance */
>> + nodemask_t nodes;
>> + struct kref kref;
>> + struct memory_tier *memtier;
>> +};
>> +
>> #ifdef CONFIG_NUMA
>> -#include <linux/types.h>
>> extern bool numa_demotion_enabled;
>> +void init_node_memory_type(int node, struct memory_dev_type *default_type);
>> +void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>
>> #else
>>
>> #define numa_demotion_enabled false
>> +static inline void init_node_memory_type(int node, struct memory_dev_type *default_type)
>> +{
>> +
>> +}
>> +
>> +static inline void clear_node_memory_type(int node, struct memory_dev_type *memtype)
>> +{
>> +
>> +}
>> #endif /* CONFIG_NUMA */
>> #endif /* _LINUX_MEMORY_TIERS_H */
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 2caa5ab446b8..e07dffb67567 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -1,6 +1,4 @@
>> // SPDX-License-Identifier: GPL-2.0
>> -#include <linux/types.h>
>> -#include <linux/nodemask.h>
>> #include <linux/slab.h>
>> #include <linux/lockdep.h>
>> #include <linux/sysfs.h>
>> @@ -21,26 +19,10 @@ struct memory_tier {
>> int adistance_start;
>> };
>>
>> -struct memory_dev_type {
>> - /* list of memory types that are part of same tier as this type */
>> - struct list_head tier_sibiling;
>> - /* abstract distance for this specific memory type */
>> - int adistance;
>> - /* Nodes of same abstract distance */
>> - nodemask_t nodes;
>> - struct memory_tier *memtier;
>> -};
>> -
>> static DEFINE_MUTEX(memory_tier_lock);
>> static LIST_HEAD(memory_tiers);
>> static struct memory_dev_type *node_memory_types[MAX_NUMNODES];
>> -/*
>> - * For now let's have 4 memory tier below default DRAM tier.
>> - */
>> -static struct memory_dev_type default_dram_type = {
>> - .adistance = MEMTIER_ADISTANCE_DRAM,
>> - .tier_sibiling = LIST_HEAD_INIT(default_dram_type.tier_sibiling),
>> -};
>> +static struct memory_dev_type *default_dram_type;
>>
>> static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
>> {
>> @@ -96,6 +78,14 @@ static struct memory_tier *__node_get_memory_tier(int node)
>> return NULL;
>> }
>>
>> +static inline void __init_node_memory_type(int node, struct memory_dev_type *default_type)
>> +{
>> + if (!node_memory_types[node]) {
>> + node_memory_types[node] = default_type;
>> + kref_get(&default_type->kref);
>> + }
>> +}
>> +
>> static struct memory_tier *set_node_memory_tier(int node)
>> {
>> struct memory_tier *memtier;
>> @@ -107,7 +97,7 @@ static struct memory_tier *set_node_memory_tier(int node)
>> return ERR_PTR(-EINVAL);
>>
>> if (!node_memory_types[node])
>> - node_memory_types[node] = &default_dram_type;
>> + __init_node_memory_type(node, default_dram_type);
>>
>> memtype = node_memory_types[node];
>> node_set(node, memtype->nodes);
>> @@ -143,6 +133,34 @@ static bool clear_node_memory_tier(int node)
>> return cleared;
>> }
>>
>> +void init_node_memory_type(int node, struct memory_dev_type *default_type)
>> +{
>> +
>> + mutex_lock(&memory_tier_lock);
>> + __init_node_memory_type(node, default_type);
>> + mutex_unlock(&memory_tier_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(init_node_memory_type);
>> +
>> +static void release_memtype(struct kref *kref)
>> +{
>> + struct memory_dev_type *memtype;
>> +
>> + memtype = container_of(kref, struct memory_dev_type, kref);
>> + kfree(memtype);
>> +}
>> +
>> +void clear_node_memory_type(int node, struct memory_dev_type *memtype)
>> +{
>> + mutex_lock(&memory_tier_lock);
>> + if (node_memory_types[node] == memtype) {
>> + node_memory_types[node] = NULL;
>> + kref_put(&memtype->kref, release_memtype);
>> + }
>> + mutex_unlock(&memory_tier_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(clear_node_memory_type);
>> +
>> static int __meminit memtier_hotplug_callback(struct notifier_block *self,
>> unsigned long action, void *_arg)
>> {
>> @@ -176,17 +194,27 @@ static int __init memory_tier_init(void)
>> int node;
>> struct memory_tier *memtier;
>>
>> + default_dram_type = kmalloc(sizeof(*default_dram_type), GFP_KERNEL);
>> + if (!default_dram_type)
>> + panic("%s() failed to allocate default DRAM tier\n", __func__);
>> +
>> mutex_lock(&memory_tier_lock);
>> +
>> + /* For now let's have 4 memory tier below default DRAM tier. */
>> + default_dram_type->adistance = MEMTIER_ADISTANCE_DRAM;
>> + INIT_LIST_HEAD(&default_dram_type->tier_sibiling);
>> + default_dram_type->memtier = NULL;
>> + kref_init(&default_dram_type->kref);
>
> It appears that we can define a function to initialize a memory type.

I added the below

static inline struct memory_dev_type *alloc_memory_type(int adistance)
static inline void destroy_memory_type(struct memory_dev_type *memtype)


>
>> /* CPU only nodes are not part of memory tiers. */
>> - default_dram_type.nodes = node_states[N_MEMORY];
>> + default_dram_type->nodes = node_states[N_MEMORY];
>>
>> - memtier = find_create_memory_tier(&default_dram_type);
>> + memtier = find_create_memory_tier(default_dram_type);
>> if (IS_ERR(memtier))
>> panic("%s() failed to register memory tier: %ld\n",
>> __func__, PTR_ERR(memtier));
>>
>> for_each_node_state(node, N_MEMORY)
>> - node_memory_types[node] = &default_dram_type;
>> + __init_node_memory_type(node, default_dram_type);
>>
>> mutex_unlock(&memory_tier_lock);
>
> Best Regards,
> Huang, Ying