[PATCH v18 09/18] s390/vfio-ap: introduce new mutex to control access to the KVM pointer

From: Tony Krowiak
Date: Mon Feb 14 2022 - 19:51:18 EST


The vfio_ap device driver registers for notification when the pointer to
the KVM object for a guest is set. Recall that the KVM lock (kvm->lock)
mutex must be taken outside of the matrix_dev->lock mutex to prevent the
reporting by lockdep of a circular locking dependency (a.k.a., a lockdep
splat):

* see commit 0cc00c8d4050 ("Fix circular lockdep when setting/clearing
crypto masks")

* see commit 86956e70761b ("replace open coded locks for
VFIO_GROUP_NOTIFY_SET_KVM notification")

With the introduction of support for hot plugging/unplugging AP devices
passed through to a KVM guest, a new guests_lock mutex is introduced to
ensure the proper locking order is maintained:

struct ap_matrix_dev {
...
struct mutex guests_lock;
...
}

The matrix_dev->guests_lock controls access to the matrix_mdev instances
that hold the state for AP devices that have been passed through to a
KVM guest. This lock must be held:

1. To control access to the KVM pointer (matrix_mdev->kvm) while the
vfio_ap device driver is using it to plug/unplug AP devices passed
through to the KVM guest.

2. To add matrix_mdev instances to or remove them from
matrix_dev->mdev_list. This is necessary to ensure the proper locking
order when the list is iterated to find an ap_matrix_mdev instance for
the purpose of plugging/unplugging AP devices passed through to a KVM
guest; for example, when a queue device is bound to (probe callback) or
removed from (remove callback) the vfio_ap device driver.

For example, when a queue device is removed from the vfio_ap device
driver, if the adapter is passed through to a KVM guest, it will have to
be unplugged. Since the vfio_ap device driver only knows the APQN of the
queue device when it is removed, in order to figure out whether the
adapter is passed through, the matrix_mdev object to which the queue is
assigned will have to be found. Its KVM pointer (matrix_mdev->kvm) can
then be used to determine if the mediated device is passed through
(matrix_mdev->kvm != NULL) and if so, to unplug the adapter.

In addition to introducing the matrix_mdev->guests_lock mutex, the
matrix_dev->lock mutex is being renamed to matrix_dev->mdevs_lock to
better reflect its purpose, which is to control access to the state of the
mediated devices under the control of the vfio_ap device driver.

It is not necessary to take the matrix_dev->guests_lock to access the KVM
pointer if the pointer is not used to plug/unplug AP devices; however, in
this case, the matrix_dev->mdevs_lock must be held in order to access the
KVM pointer since it set and cleared under the protection of the
mdevs_lock.
A case in point is the function that handles interception of the PQAP(AQIC)
instruction sub-function by the host. This handler needs to access the KVM
pointer, but only for the purposes of setting or clearing IRQ resources, so
only the matrix_dev->mdevs_lock needs to be held.

The proper locking order must be maintained whenever plugging/unplugging
AP devices passed through to a KVM guest under the control of the vfio_ap
device driver:

matrix_dev->guests_lock
matrix_mdev->kvm->lock
matrix_dev->mdevs_lock

Signed-off-by: Tony Krowiak <akrowiak@xxxxxxxxxxxxx>
---
drivers/s390/crypto/vfio_ap_drv.c | 7 ++-
drivers/s390/crypto/vfio_ap_ops.c | 81 +++++++++++++++------------
drivers/s390/crypto/vfio_ap_private.h | 17 ++++--
3 files changed, 61 insertions(+), 44 deletions(-)

diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index 5013a5531171..e8f3540ebfda 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -68,7 +68,7 @@ static ssize_t status_show(struct device *dev,
struct ap_matrix_mdev *matrix_mdev;
struct ap_device *apdev = to_ap_dev(dev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
q = dev_get_drvdata(&apdev->device);
matrix_mdev = vfio_ap_mdev_for_queue(q);

@@ -84,7 +84,7 @@ static ssize_t status_show(struct device *dev,
AP_QUEUE_UNASSIGNED);
}

- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);

return nchars;
}
@@ -155,8 +155,9 @@ static int vfio_ap_matrix_dev_create(void)
goto matrix_alloc_err;
}

- mutex_init(&matrix_dev->lock);
+ mutex_init(&matrix_dev->mdevs_lock);
INIT_LIST_HEAD(&matrix_dev->mdev_list);
+ mutex_init(&matrix_dev->guests_lock);

dev_set_name(&matrix_dev->device, "%s", VFIO_AP_DEV_NAME);
matrix_dev->device.parent = root_device;
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index f2b98f347f9f..623a4b38676d 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -278,7 +278,7 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;

apqn = vcpu->run->s.regs.gprs[0] & 0xffff;
- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

if (!vcpu->kvm->arch.crypto.pqap_hook)
goto out_unlock;
@@ -305,7 +305,7 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
out_unlock:
memcpy(&vcpu->run->s.regs.gprs[1], &qstatus, sizeof(qstatus));
vcpu->run->s.regs.gprs[1] >>= 32;
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
return 0;
}

@@ -396,9 +396,9 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev)
vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->shadow_apcb);
hash_init(matrix_mdev->qtable.queues);
mdev_set_drvdata(mdev, matrix_mdev);
- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
list_add(&matrix_mdev->node, &matrix_dev->mdev_list);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);

ret = vfio_register_emulated_iommu_dev(&matrix_mdev->vdev);
if (ret)
@@ -407,9 +407,9 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev)
return 0;

err_list:
- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
list_del(&matrix_mdev->node);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);
vfio_uninit_group_dev(&matrix_mdev->vdev);
kfree(matrix_mdev);
err_dec_available:
@@ -472,11 +472,11 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev)

vfio_unregister_group_dev(&matrix_mdev->vdev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);
vfio_ap_mdev_reset_queues(matrix_mdev);
vfio_ap_mdev_unlink_fr_queues(matrix_mdev);
list_del(&matrix_mdev->node);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
vfio_uninit_group_dev(&matrix_mdev->vdev);
kfree(matrix_mdev);
atomic_inc(&matrix_dev->available_instances);
@@ -652,7 +652,8 @@ static ssize_t assign_adapter_store(struct device *dev,

struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If the KVM guest is running, disallow assignment of adapter */
if (matrix_mdev->kvm) {
@@ -683,7 +684,8 @@ static ssize_t assign_adapter_store(struct device *dev,
vfio_ap_mdev_filter_matrix(apm, matrix_mdev->matrix.aqm, matrix_mdev);
ret = count;
done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
+ mutex_unlock(&matrix_dev->guests_lock);

return ret;
}
@@ -726,7 +728,7 @@ static ssize_t unassign_adapter_store(struct device *dev,
unsigned long apid;
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If the KVM guest is running, disallow unassignment of adapter */
if (matrix_mdev->kvm) {
@@ -751,7 +753,7 @@ static ssize_t unassign_adapter_store(struct device *dev,

ret = count;
done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
return ret;
}
static DEVICE_ATTR_WO(unassign_adapter);
@@ -806,7 +808,8 @@ static ssize_t assign_domain_store(struct device *dev,
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
unsigned long max_apqi = matrix_mdev->matrix.aqm_max;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If the KVM guest is running, disallow assignment of domain */
if (matrix_mdev->kvm) {
@@ -836,7 +839,8 @@ static ssize_t assign_domain_store(struct device *dev,
vfio_ap_mdev_filter_matrix(matrix_mdev->matrix.apm, aqm, matrix_mdev);
ret = count;
done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
+ mutex_unlock(&matrix_dev->guests_lock);

return ret;
}
@@ -879,7 +883,7 @@ static ssize_t unassign_domain_store(struct device *dev,
unsigned long apqi;
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If the KVM guest is running, disallow unassignment of domain */
if (matrix_mdev->kvm) {
@@ -905,7 +909,7 @@ static ssize_t unassign_domain_store(struct device *dev,
ret = count;

done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
return ret;
}
static DEVICE_ATTR_WO(unassign_domain);
@@ -932,7 +936,7 @@ static ssize_t assign_control_domain_store(struct device *dev,
unsigned long id;
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If the KVM guest is running, disallow assignment of control domain */
if (matrix_mdev->kvm) {
@@ -958,7 +962,7 @@ static ssize_t assign_control_domain_store(struct device *dev,
vfio_ap_mdev_filter_cdoms(matrix_mdev);
ret = count;
done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
return ret;
}
static DEVICE_ATTR_WO(assign_control_domain);
@@ -986,7 +990,7 @@ static ssize_t unassign_control_domain_store(struct device *dev,
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
unsigned long max_domid = matrix_mdev->matrix.adm_max;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

/* If a KVM guest is running, disallow unassignment of control domain */
if (matrix_mdev->kvm) {
@@ -1009,7 +1013,7 @@ static ssize_t unassign_control_domain_store(struct device *dev,

ret = count;
done:
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
return ret;
}
static DEVICE_ATTR_WO(unassign_control_domain);
@@ -1025,13 +1029,13 @@ static ssize_t control_domains_show(struct device *dev,
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
unsigned long max_domid = matrix_mdev->matrix.adm_max;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);
for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1) {
n = sprintf(bufpos, "%04lx\n", id);
bufpos += n;
nchars += n;
}
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);

return nchars;
}
@@ -1054,7 +1058,7 @@ static ssize_t matrix_show(struct device *dev, struct device_attribute *attr,
apid1 = find_first_bit_inv(matrix_mdev->matrix.apm, napm_bits);
apqi1 = find_first_bit_inv(matrix_mdev->matrix.aqm, naqm_bits);

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

if ((apid1 < napm_bits) && (apqi1 < naqm_bits)) {
for_each_set_bit_inv(apid, matrix_mdev->matrix.apm, napm_bits) {
@@ -1080,7 +1084,7 @@ static ssize_t matrix_show(struct device *dev, struct device_attribute *attr,
}
}

- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);

return nchars;
}
@@ -1134,13 +1138,15 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
up_write(&kvm->arch.crypto.pqap_hook_rwsem);

+ mutex_lock(&matrix_dev->guests_lock);
mutex_lock(&kvm->lock);
- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

list_for_each_entry(m, &matrix_dev->mdev_list, node) {
if (m != matrix_mdev && m->kvm == kvm) {
+ mutex_unlock(&matrix_dev->mdevs_lock);
mutex_unlock(&kvm->lock);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);
return -EPERM;
}
}
@@ -1151,8 +1157,9 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
matrix_mdev->shadow_apcb.aqm,
matrix_mdev->shadow_apcb.adm);

+ mutex_unlock(&matrix_dev->mdevs_lock);
mutex_unlock(&kvm->lock);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);
}

return 0;
@@ -1210,16 +1217,18 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev,
kvm->arch.crypto.pqap_hook = NULL;
up_write(&kvm->arch.crypto.pqap_hook_rwsem);

+ mutex_lock(&matrix_dev->guests_lock);
mutex_lock(&kvm->lock);
- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);

kvm_arch_crypto_clear_masks(kvm);
vfio_ap_mdev_reset_queues(matrix_mdev);
kvm_put_kvm(kvm);
matrix_mdev->kvm = NULL;

+ mutex_unlock(&matrix_dev->mdevs_lock);
mutex_unlock(&kvm->lock);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->guests_lock);
}
}

@@ -1396,7 +1405,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev,
container_of(vdev, struct ap_matrix_mdev, vdev);
int ret;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);
switch (cmd) {
case VFIO_DEVICE_GET_INFO:
ret = vfio_ap_mdev_get_device_info(arg);
@@ -1408,7 +1417,7 @@ static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev,
ret = -EOPNOTSUPP;
break;
}
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);

return ret;
}
@@ -1493,7 +1502,8 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev)
if (!q)
return -ENOMEM;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->guests_lock);
+ mutex_lock(&matrix_dev->mdevs_lock);
q->apqn = to_ap_queue(&apdev->device)->qid;
q->saved_isc = VFIO_AP_ISC_INVALID;
vfio_ap_queue_link_mdev(q);
@@ -1504,7 +1514,8 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev)
q->matrix_mdev);
}
dev_set_drvdata(&apdev->device, q);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
+ mutex_unlock(&matrix_dev->guests_lock);

return 0;
}
@@ -1514,7 +1525,7 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev)
unsigned long apid;
struct vfio_ap_queue *q;

- mutex_lock(&matrix_dev->lock);
+ mutex_lock(&matrix_dev->mdevs_lock);
q = dev_get_drvdata(&apdev->device);

if (q->matrix_mdev) {
@@ -1528,5 +1539,5 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev)
vfio_ap_mdev_reset_queue(q, 1);
dev_set_drvdata(&apdev->device, NULL);
kfree(q);
- mutex_unlock(&matrix_dev->lock);
+ mutex_unlock(&matrix_dev->mdevs_lock);
}
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index fa11a7e91e24..7e82a72d2083 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -33,20 +33,25 @@
* @available_instances: number of mediated matrix devices that can be created
* @info: the struct containing the output from the PQAP(QCI) instruction
* @mdev_list: the list of mediated matrix devices created
- * @lock: mutex for locking the AP matrix device. This lock will be
- * taken every time we fiddle with state managed by the vfio_ap
- * driver, be it using @mdev_list or writing the state of a
- * single ap_matrix_mdev device. It's quite coarse but we don't
- * expect much contention.
+ * @mdevs_lock: mutex for locking the ap_matrix_mdev devices under the control
+ * of the vfio_ap device driver. This lock will be taken every time
+ * we fiddle with state of an ap_matrix_mdev device. It's quite
+ * coarse but we don't expect much contention.
* @vfio_ap_drv: the vfio_ap device driver
+ * @guests_lock: mutex for controlling access to a guest that is using AP
+ * devices passed through by the vfio_ap device driver. This lock
+ * will be taken when the AP devices are plugged into or unplugged
+ * from a guest, and when an ap_matrix_mdev device is added to or
+ * removed from @mdev_list or the list is iterated.
*/
struct ap_matrix_dev {
struct device device;
atomic_t available_instances;
struct ap_config_info info;
struct list_head mdev_list;
- struct mutex lock;
+ struct mutex mdevs_lock;
struct ap_driver *vfio_ap_drv;
+ struct mutex guests_lock;
};

extern struct ap_matrix_dev *matrix_dev;
--
2.31.1