Re: [PATCH v5 4/4] KVM: arm64: Initialize the ptdump parser with stage-2 attributes

From: Sebastian Ene
Date: Tue Feb 13 2024 - 11:59:12 EST


On Tue, Feb 13, 2024 at 12:42:42AM +0000, Oliver Upton wrote:
> On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:
> > Define a set of attributes used by the ptdump parser to display the
> > properties of a guest memory region covered by a pagetable descriptor.
> > Build a description of the pagetable levels and initialize the parser
> > with this configuration.
> >
> > Signed-off-by: Sebastian Ene <sebastianene@xxxxxxxxxx>
> > ---
> > arch/arm64/kvm/ptdump.c | 156 ++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 156 insertions(+)
> >
> > diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> > index a4e984da8aa7..60725d46f17b 100644
> > --- a/arch/arm64/kvm/ptdump.c
> > +++ b/arch/arm64/kvm/ptdump.c
> > @@ -14,6 +14,69 @@
> > #include <kvm_ptdump.h>
> >
> >
> > +#define ADDR_MARKER_LEN (2)
> > +#define MARKER_MSG_LEN (32)
> > +
> > +static const struct prot_bits stage2_pte_bits[] = {
> > + {
> > + .mask = PTE_VALID,
> > + .val = PTE_VALID,
> > + .set = " ",
> > + .clear = "F",
> > + }, {
> > + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > + .val = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> > + .set = "XN",
> > + .clear = " ",
> > + }, {
> > + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> > + .set = "R",
> > + .clear = " ",
> > + }, {
> > + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> > + .set = "W",
> > + .clear = " ",
> > + }, {
> > + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> > + .set = "AF",
> > + .clear = " ",
> > + }, {
> > + .mask = PTE_NG,
> > + .val = PTE_NG,
> > + .set = "FnXS",
> > + .clear = " ",
> > + }, {
> > + .mask = PTE_CONT | PTE_VALID,
> > + .val = PTE_CONT | PTE_VALID,
> > + .set = "CON",
> > + .clear = " ",
> > + }, {
> > + .mask = PTE_TABLE_BIT,
> > + .val = PTE_TABLE_BIT,
> > + .set = " ",
> > + .clear = "BLK",
>
> <snip>
>
> > + }, {
> > + .mask = KVM_PGTABLE_PROT_SW0,
> > + .val = KVM_PGTABLE_PROT_SW0,
> > + .set = "SW0", /* PKVM_PAGE_SHARED_OWNED */
> > + }, {
> > + .mask = KVM_PGTABLE_PROT_SW1,
> > + .val = KVM_PGTABLE_PROT_SW1,
> > + .set = "SW1", /* PKVM_PAGE_SHARED_BORROWED */
> > + }, {
> > + .mask = KVM_PGTABLE_PROT_SW2,
> > + .val = KVM_PGTABLE_PROT_SW2,
> > + .set = "SW2",
> > + }, {
> > + .mask = KVM_PGTABLE_PROT_SW3,
> > + .val = KVM_PGTABLE_PROT_SW3,
> > + .set = "SW3",
> > + },
>
> </snip>
>
> These bits are never set in a 'normal' stage-2 PTE, does it make sense
> to carry descriptors for them here? In contexts where the SW bits are
> used it might be more useful if the ptdump used the specific meaning of
> the bit (e.g. OWNED, BORROWED, etc) instead of the generic SW%d.
>
> That can all wait for when the pKVM bits come into play though.
>

True, I guess we don't need these bits for now. We can insert them at a
later time when we will have the pKVM support and then we will have to
use their real maning for the state tracking.

> > +};
> > +
> > static int kvm_ptdump_guest_open(struct inode *inode, struct file *file);
> > static int kvm_ptdump_guest_show(struct seq_file *m, void *);
> >
> > @@ -52,6 +115,94 @@ static int kvm_ptdump_show_common(struct seq_file *m,
> > return kvm_pgtable_walk(pgtable, 0, BIT(pgtable->ia_bits), &walker);
> > }
> >
> > +static void kvm_ptdump_build_levels(struct pg_level *level, u32 start_lvl)
> > +{
> > + static const char * const level_names[] = {"PGD", "PUD", "PMD", "PTE"};
> > + u32 i = 0;
> > + u64 mask_lvl = 0;
>
> nit: _lvl adds nothing to this, and actually confused me for a sec as
> to whether the mask changed per level.
>

I will drop it from the name to avoid the confusion.

> > + if (start_lvl > 2) {
> > + pr_err("invalid start_lvl %u\n", start_lvl);
> > + return;
> > + }
>
> Can't we get something like -EINVAL out here and fail initialization?
> Otherwise breadcrumbs like this pr_err() are hard to connect to a
> specific failure.
>

Ok, I will add a return code for this function.

> > + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> > + mask_lvl |= stage2_pte_bits[i].mask;
> > +
> > + for (i = start_lvl; i <= KVM_PGTABLE_LAST_LEVEL; i++) {
> > + level[i].name = level_names[i];
> > + level[i].num = ARRAY_SIZE(stage2_pte_bits);
> > + level[i].bits = stage2_pte_bits;
> > + level[i].mask = mask_lvl;
> > + }
> > +
> > + if (start_lvl > 0)
> > + level[start_lvl].name = level_names[0];
> > +}
> > +
> > +static int kvm_ptdump_parser_init(struct pg_state *st,
> > + struct kvm_pgtable *pgtable,
> > + struct seq_file *m)
> > +{
> > + struct addr_marker *ipa_addr_marker;
> > + char *marker_msg;
> > + struct pg_level *level_descr;
> > + struct ptdump_range *range;
> > +
> > + ipa_addr_marker = kzalloc(sizeof(struct addr_marker) * ADDR_MARKER_LEN,
> > + GFP_KERNEL_ACCOUNT);
> > + if (!ipa_addr_marker)
> > + return -ENOMEM;
> > +
> > + marker_msg = kzalloc(MARKER_MSG_LEN, GFP_KERNEL_ACCOUNT);
> > + if (!marker_msg)
> > + goto free_with_marker;
> > +
> > + level_descr = kzalloc(sizeof(struct pg_level) * (KVM_PGTABLE_LAST_LEVEL + 1),
> > + GFP_KERNEL_ACCOUNT);
> > + if (!level_descr)
> > + goto free_with_msg;
> > +
> > + range = kzalloc(sizeof(struct ptdump_range) * ADDR_MARKER_LEN,
> > + GFP_KERNEL_ACCOUNT);
> > + if (!range)
> > + goto free_with_level;
> > +
> > + kvm_ptdump_build_levels(level_descr, pgtable->start_level);
> > +
> > + snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> > + pgtable->ia_bits, pgtable->start_level);
> > +
> > + ipa_addr_marker[0].name = marker_msg;
>
> Is the dynamic name worth the added complexity? I see nothing wrong with
> exposing additional debugfs files for simple attributes like the IPA
> range and page table levels.
>
> I know it isn't *that* much, just looking for every opportunity to
> simplify further.
>

We can keep them separate, I have no strong opinion about this. I think
this was Vincent's, original suggestion to have them so I will check with
him as well.

> > + ipa_addr_marker[1].start_address = BIT(pgtable->ia_bits);
> > + range[0].end = BIT(pgtable->ia_bits);
> > +
> > + st->seq = m;
> > + st->marker = ipa_addr_marker;
> > + st->level = -1,
> > + st->pg_level = level_descr,
> > + st->ptdump.range = range;
> > + return 0;
> > +
> > +free_with_level:
> > + kfree(level_descr);
> > +free_with_msg:
> > + kfree(marker_msg);
> > +free_with_marker:
> > + kfree(ipa_addr_marker);
> > + return -ENOMEM;
> > +}
> > +
> > +static void kvm_ptdump_parser_teardown(struct pg_state *st)
> > +{
> > + const struct addr_marker *ipa_addr_marker = st->marker;
> > +
> > + kfree(ipa_addr_marker[0].name);
> > + kfree(ipa_addr_marker);
> > + kfree(st->pg_level);
> > + kfree(st->ptdump.range);
> > +}
> > +
> > static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> > {
> > struct kvm *guest_kvm = m->private;
> > @@ -59,10 +210,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> > struct pg_state parser_state = {0};
> > int ret;
> >
> > + ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> > + if (ret)
> > + return ret;
> > +
>
> Can this be done at open(), or am I missing something?
>

I guess we can do this in open() but then we will have to add again that
struct that wraps some ptdump specific state tracking. It seemed a bit cleaner in
this way. What do you think ?

> > write_lock(&guest_kvm->mmu_lock);
> > ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
> > write_unlock(&guest_kvm->mmu_lock);
> >
> > + kvm_ptdump_parser_teardown(&parser_state);
>
> Same question here, can this happen at close()? I guess you'll need a
> struct to encapsulate pg_state and a pointer to the VM at least.
>

Right, I tried to avoid using a separate struct as we discussed in v4.

> Actually, come to think of it, if you embed all of the data you need for
> the walker into a structure you can just do a single allocation for it
> upfront.
>
> --
> Thanks,
> Oliver

Thanks for the feedback,
Seb