Re: [PATCH v5 4/4] KVM: arm64: Initialize the ptdump parser with stage-2 attributes

From: Oliver Upton
Date: Mon Feb 12 2024 - 20:16:10 EST


On Wed, Feb 07, 2024 at 02:48:33PM +0000, Sebastian Ene wrote:
> Define a set of attributes used by the ptdump parser to display the
> properties of a guest memory region covered by a pagetable descriptor.
> Build a description of the pagetable levels and initialize the parser
> with this configuration.
>
> Signed-off-by: Sebastian Ene <sebastianene@xxxxxxxxxx>
> ---
> arch/arm64/kvm/ptdump.c | 156 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 156 insertions(+)
>
> diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
> index a4e984da8aa7..60725d46f17b 100644
> --- a/arch/arm64/kvm/ptdump.c
> +++ b/arch/arm64/kvm/ptdump.c
> @@ -14,6 +14,69 @@
> #include <kvm_ptdump.h>
>
>
> +#define ADDR_MARKER_LEN (2)
> +#define MARKER_MSG_LEN (32)
> +
> +static const struct prot_bits stage2_pte_bits[] = {
> + {
> + .mask = PTE_VALID,
> + .val = PTE_VALID,
> + .set = " ",
> + .clear = "F",
> + }, {
> + .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> + .val = KVM_PTE_LEAF_ATTR_HI_S2_XN | PTE_VALID,
> + .set = "XN",
> + .clear = " ",
> + }, {
> + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | PTE_VALID,
> + .set = "R",
> + .clear = " ",
> + }, {
> + .mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> + .val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | PTE_VALID,
> + .set = "W",
> + .clear = " ",
> + }, {
> + .mask = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> + .val = KVM_PTE_LEAF_ATTR_LO_S2_AF | PTE_VALID,
> + .set = "AF",
> + .clear = " ",
> + }, {
> + .mask = PTE_NG,
> + .val = PTE_NG,
> + .set = "FnXS",
> + .clear = " ",
> + }, {
> + .mask = PTE_CONT | PTE_VALID,
> + .val = PTE_CONT | PTE_VALID,
> + .set = "CON",
> + .clear = " ",
> + }, {
> + .mask = PTE_TABLE_BIT,
> + .val = PTE_TABLE_BIT,
> + .set = " ",
> + .clear = "BLK",

<snip>

> + }, {
> + .mask = KVM_PGTABLE_PROT_SW0,
> + .val = KVM_PGTABLE_PROT_SW0,
> + .set = "SW0", /* PKVM_PAGE_SHARED_OWNED */
> + }, {
> + .mask = KVM_PGTABLE_PROT_SW1,
> + .val = KVM_PGTABLE_PROT_SW1,
> + .set = "SW1", /* PKVM_PAGE_SHARED_BORROWED */
> + }, {
> + .mask = KVM_PGTABLE_PROT_SW2,
> + .val = KVM_PGTABLE_PROT_SW2,
> + .set = "SW2",
> + }, {
> + .mask = KVM_PGTABLE_PROT_SW3,
> + .val = KVM_PGTABLE_PROT_SW3,
> + .set = "SW3",
> + },

</snip>

These bits are never set in a 'normal' stage-2 PTE; does it make sense
to carry descriptors for them here? In contexts where the SW bits are
used, it might be more useful if the ptdump used the specific meaning of
the bit (e.g. OWNED, BORROWED, etc.) instead of the generic SW%d.

That can all wait for when the pKVM bits come into play though.

> +};
> +
> static int kvm_ptdump_guest_open(struct inode *inode, struct file *file);
> static int kvm_ptdump_guest_show(struct seq_file *m, void *);
>
> @@ -52,6 +115,94 @@ static int kvm_ptdump_show_common(struct seq_file *m,
> return kvm_pgtable_walk(pgtable, 0, BIT(pgtable->ia_bits), &walker);
> }
>
> +static void kvm_ptdump_build_levels(struct pg_level *level, u32 start_lvl)
> +{
> + static const char * const level_names[] = {"PGD", "PUD", "PMD", "PTE"};
> + u32 i = 0;
> + u64 mask_lvl = 0;

nit: the _lvl suffix adds nothing to this name, and it actually confused
me for a sec as to whether the mask changed per level.

> + if (start_lvl > 2) {
> + pr_err("invalid start_lvl %u\n", start_lvl);
> + return;
> + }

Can't we get something like -EINVAL out here and fail initialization?
Otherwise breadcrumbs like this pr_err() are hard to connect to a
specific failure.

> + for (i = 0; i < ARRAY_SIZE(stage2_pte_bits); i++)
> + mask_lvl |= stage2_pte_bits[i].mask;
> +
> + for (i = start_lvl; i <= KVM_PGTABLE_LAST_LEVEL; i++) {
> + level[i].name = level_names[i];
> + level[i].num = ARRAY_SIZE(stage2_pte_bits);
> + level[i].bits = stage2_pte_bits;
> + level[i].mask = mask_lvl;
> + }
> +
> + if (start_lvl > 0)
> + level[start_lvl].name = level_names[0];
> +}
> +
> +static int kvm_ptdump_parser_init(struct pg_state *st,
> + struct kvm_pgtable *pgtable,
> + struct seq_file *m)
> +{
> + struct addr_marker *ipa_addr_marker;
> + char *marker_msg;
> + struct pg_level *level_descr;
> + struct ptdump_range *range;
> +
> + ipa_addr_marker = kzalloc(sizeof(struct addr_marker) * ADDR_MARKER_LEN,
> + GFP_KERNEL_ACCOUNT);
> + if (!ipa_addr_marker)
> + return -ENOMEM;
> +
> + marker_msg = kzalloc(MARKER_MSG_LEN, GFP_KERNEL_ACCOUNT);
> + if (!marker_msg)
> + goto free_with_marker;
> +
> + level_descr = kzalloc(sizeof(struct pg_level) * (KVM_PGTABLE_LAST_LEVEL + 1),
> + GFP_KERNEL_ACCOUNT);
> + if (!level_descr)
> + goto free_with_msg;
> +
> + range = kzalloc(sizeof(struct ptdump_range) * ADDR_MARKER_LEN,
> + GFP_KERNEL_ACCOUNT);
> + if (!range)
> + goto free_with_level;
> +
> + kvm_ptdump_build_levels(level_descr, pgtable->start_level);
> +
> + snprintf(marker_msg, MARKER_MSG_LEN, "IPA bits %2u start lvl %1d",
> + pgtable->ia_bits, pgtable->start_level);
> +
> + ipa_addr_marker[0].name = marker_msg;

Is the dynamic name worth the added complexity? I see nothing wrong with
exposing additional debugfs files for simple attributes like the IPA
range and page table levels.

I know it isn't *that* much, just looking for every opportunity to
simplify further.

> + ipa_addr_marker[1].start_address = BIT(pgtable->ia_bits);
> + range[0].end = BIT(pgtable->ia_bits);
> +
> + st->seq = m;
> + st->marker = ipa_addr_marker;
> + st->level = -1,
> + st->pg_level = level_descr,
> + st->ptdump.range = range;
> + return 0;
> +
> +free_with_level:
> + kfree(level_descr);
> +free_with_msg:
> + kfree(marker_msg);
> +free_with_marker:
> + kfree(ipa_addr_marker);
> + return -ENOMEM;
> +}
> +
> +static void kvm_ptdump_parser_teardown(struct pg_state *st)
> +{
> + const struct addr_marker *ipa_addr_marker = st->marker;
> +
> + kfree(ipa_addr_marker[0].name);
> + kfree(ipa_addr_marker);
> + kfree(st->pg_level);
> + kfree(st->ptdump.range);
> +}
> +
> static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> {
> struct kvm *guest_kvm = m->private;
> @@ -59,10 +210,15 @@ static int kvm_ptdump_guest_show(struct seq_file *m, void *)
> struct pg_state parser_state = {0};
> int ret;
>
> + ret = kvm_ptdump_parser_init(&parser_state, mmu->pgt, m);
> + if (ret)
> + return ret;
> +

Can this be done at open(), or am I missing something?

> write_lock(&guest_kvm->mmu_lock);
> ret = kvm_ptdump_show_common(m, mmu->pgt, &parser_state);
> write_unlock(&guest_kvm->mmu_lock);
>
> + kvm_ptdump_parser_teardown(&parser_state);

Same question here, can this happen at close()? I guess you'll need a
struct to encapsulate pg_state and a pointer to the VM at least.

Actually, come to think of it, if you embed all of the data you need for
the walker into a structure you can just do a single allocation for it
upfront.

--
Thanks,
Oliver