Re: [PATCH v19 038/130] KVM: TDX: create/destroy VM structure

From: Huang, Kai
Date: Wed Mar 27 2024 - 19:34:03 EST


.. to continue the previous review ...

+static int __tdx_td_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ unsigned long *tdcs_pa = NULL;
+ unsigned long tdr_pa = 0;
+ unsigned long va;
+ int ret, i;
+ u64 err;
+
+ ret = tdx_guest_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_hkid;
+ tdr_pa = __pa(va);
+
+ tdcs_pa = kcalloc(tdx_info->nr_tdcs_pages, sizeof(*kvm_tdx->tdcs_pa),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!tdcs_pa)
+ goto free_tdr;

Empty line.

+ for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ goto free_tdcs;
+ tdcs_pa[i] = __pa(va);
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto free_tdcs;
+ }

Empty line.

+ cpus_read_lock();
+ /*
+ * Need at least one CPU of the package to be online in order to
+ * program all packages for host key id. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+ * Because it's hard for human operator to figure out the
+ * reason, warn it.
+ */
+#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+ pr_warn_ratelimited(MSG_ALLPKG);
+ goto free_packages;
+ }
+
+ /*
+ * Acquire global lock to avoid TDX_OPERAND_BUSY:
+ * TDH.MNG.CREATE and other APIs try to lock the global Key Owner
+ * Table (KOT) to track the assigned TDX private HKID. It doesn't spin
+ * to acquire the lock, returns TDX_OPERAND_BUSY instead, and let the
+ * caller to handle the contention. This is because of time limitation
+ * usable inside the TDX module and OS/VMM knows better about process
+ * scheduling.
+ *
+ * APIs to acquire the lock of KOT:
+ * TDH.MNG.CREATE, TDH.MNG.KEY.FREEID, TDH.MNG.VPFLUSHDONE, and
+ * TDH.PHYMEM.CACHE.WB. > + */

Don't need to mention all SEAMCALLs here, but put a comment where appliciable, i.e., where they are used.

/*
* TDH.MNG.CREATE tries to grab the global TDX module and fails
* with TDX_OPERAND_BUSY when it fails to grab. Take the global
* lock to prevent it from failure.
*/
+ mutex_lock(&tdx_lock);
+ err = tdh_mng_create(tdr_pa, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);

Empty line.

+ if (err == TDX_RND_NO_ENTROPY) {
+ ret = -EAGAIN;
+ goto free_packages;
+ }
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err, NULL);
+ ret = -EIO;
+ goto free_packages;
+ }

I would prefer more empty lines.

+ kvm_tdx->tdr_pa = tdr_pa;
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+ * encryption key associated to a TDX private host key id
+ * assigned to this TDR. Concurrent operations on same memory
+ * controller results in TDX_OPERAND_BUSY. Avoid this race by
+ * mutex.
+ */

IIUC the race can only happen when you are creating multiple TDX guests simulatenously? Please clarify this in the comment.

And I even don't think you need all these TDX module details:

/*
* Concurrent run of TDH.MNG.KEY.CONFIG on the same
* package resluts in TDX_OPERAND_BUSY. When creating
* multiple TDX guests simultaneously this can run
* concurrently. Take the per-package lock to
* serialize.
*/
+ mutex_lock(&tdx_mng_key_config_lock[pkg]);
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ &kvm_tdx->tdr_pa, true);
+ mutex_unlock(&tdx_mng_key_config_lock[pkg]);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret) {
+ i = 0;
+ goto teardown;
+ }
+
+ kvm_tdx->tdcs_pa = tdcs_pa;
+ for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+ err = tdh_mng_addcx(kvm_tdx->tdr_pa, tdcs_pa[i]);
+ if (err == TDX_RND_NO_ENTROPY) {
+ /* Here it's hard to allow userspace to retry. */
+ ret = -EBUSY;
+ goto teardown;
+ }
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err, NULL);
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+ /*
+ * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated
+ * ioctl() to define the configure CPUID values for the TD.
+ */

Then, how about renaming this function to __tdx_td_create()?

+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ for (; i < tdx_info->nr_tdcs_pages; i++) {
+ if (tdcs_pa[i]) {
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ tdcs_pa[i] = 0;
+ }
+ }
+ if (!kvm_tdx->tdcs_pa)
+ kfree(tdcs_pa);

The code to "free TDCS pages in a loop and free the array" is done below with duplicated code. I am wondering whether we have way to eliminate one.

But I have lost track here, so perhaps we can review again after we split the patch to smaller pieces.

+ tdx_mmu_release_hkid(kvm);
+ tdx_vm_free(kvm);
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+free_tdcs:
+ for (i = 0; i < tdx_info->nr_tdcs_pages; i++) {
+ if (tdcs_pa[i])
+ free_page((unsigned long)__va(tdcs_pa[i]));
+ }
+ kfree(tdcs_pa);
+ kvm_tdx->tdcs_pa = NULL;
+
+free_tdr:
+ if (tdr_pa)
+ free_page((unsigned long)__va(tdr_pa));
+ kvm_tdx->tdr_pa = 0;
+free_hkid:
+ if (is_hkid_assigned(kvm_tdx))
+ tdx_hkid_free(kvm_tdx);
+ return ret;
+}
+
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_tdx_cmd tdx_cmd;
@@ -215,12 +664,13 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)
static int __init tdx_module_setup(void)
{
- u16 num_cpuid_config;
+ u16 num_cpuid_config, tdcs_base_size;
int ret;
u32 i;
struct tdx_md_map mds[] = {
TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
+ TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
};
struct tdx_metadata_field_mapping fields[] = {
@@ -273,6 +723,8 @@ static int __init tdx_module_setup(void)
c->edx = ecx_edx >> 32;
}
+ tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
+

Round up the 'tdcs_base_size' to make sure you have enough room, or put a WARN() here if not page aligned?

return 0;
error_out:
@@ -319,13 +771,27 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
struct tdx_enabled enable = {
.err = ATOMIC_INIT(0),
};
+ int max_pkgs;
int r = 0;
+ int i;

Nit: you can put the 3 into one line.

+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+ pr_warn("MOVDIR64B is reqiured for TDX\n");

It's better to make it more clear:

"Disable TDX: MOVDIR64B is not supported or disabled by the kernel."

Or, to match below:

"Cannot enable TDX w/o MOVDIR64B".

+ return -EOPNOTSUPP;
+ }
if (!enable_ept) {
pr_warn("Cannot enable TDX with EPT disabled\n");
return -EINVAL;
}
+ max_pkgs = topology_max_packages();
+ tdx_mng_key_config_lock = kcalloc(max_pkgs, sizeof(*tdx_mng_key_config_lock),
+ GFP_KERNEL);
+ if (!tdx_mng_key_config_lock)
+ return -ENOMEM;
+ for (i = 0; i < max_pkgs; i++)
+ mutex_init(&tdx_mng_key_config_lock[i]);
+

Using a per-socket lock looks a little bit overkill to me. I don't know whether we need to do in the initial version. Will leave to others.

Please at least add a comment to explain this is for better performance when creating multiple TDX guests IIUC?

if (!zalloc_cpumask_var(&enable.enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out;
@@ -350,4 +816,5 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
void tdx_hardware_unsetup(void)
{
kfree(tdx_info);
+ kfree(tdx_mng_key_config_lock);

The kernel actually has a mutex_destroy(). It is empty when CONFIG_DEBUG_LOCK_ALLOC is off, but I think it should be standard proceedure to also mutex_destory()?

[...]