Re: [PATCH v2 11/13] KVM: x86: add KVM_CAP_X2APIC_API

From: Yang Zhang
Date: Mon Jul 11 2016 - 02:07:15 EST


On 2016/7/8 1:15, Radim KrÄmÃÅ wrote:
KVM_CAP_X2APIC_API can be enabled to extend APIC ID in get/set ioctl and MSI
addresses to 32 bits. Both are needed to support x2APIC.

The capability has to be toggleable and disabled by default, because get/set
ioctl shifted and truncated APIC ID to 8 bits by using a non-standard protocol
inspired by xAPIC and the change is not backward-compatible.

Changes to MSI addresses follow the format used by interrupt remapping unit.
The upper address word, that used to be 0, contains upper 24 bits of the LAPIC
address in its upper 24 bits. Lower 8 bits are reserved as 0.
Using the upper address word is not backward-compatible either as we didn't
check that userspace zeroed the word. Reserved bits are still not explicitly

Does this means we cannot migrate the VM from KVM_CAP_X2APIC_API enabled host to the disable host even VM doesn't have more than 255 VCPUs?

checked, but non-zero data will affect LAPIC addresses, which will cause a bug.

Signed-off-by: Radim KrÄmÃÅ <rkrcmar@xxxxxxxxxx>
---
v2:
* pass struct kvm into kvm_set_msi_irq [Paolo]
* trace address_hi [David]
* use hex dst, like other tracepoins
* strict reserved MSI bits checking [Paolo]
* loose reserved capability bits checking [Paolo]
* improved documentation [Paolo]

Documentation/virtual/kvm/api.txt | 32 ++++++++++++++++++++++++++++++++
arch/x86/include/asm/kvm_host.h | 4 +++-
arch/x86/kvm/irq_comm.c | 26 +++++++++++++++++++++-----
arch/x86/kvm/lapic.c | 13 +++++++++----
arch/x86/kvm/vmx.c | 2 +-
arch/x86/kvm/x86.c | 5 +++++
include/trace/events/kvm.h | 5 +++--
include/uapi/linux/kvm.h | 1 +
8 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 09efa9eb3926..a8f2ef910f98 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1482,6 +1482,10 @@ struct kvm_irq_routing_msi {
__u32 pad;
};

+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id. Bits 7-0 of address_hi must be zero.
+
struct kvm_irq_routing_s390_adapter {
__u64 ind_addr;
__u64 summary_addr;
@@ -1583,6 +1587,15 @@ struct kvm_lapic_state {
Reads the Local APIC registers and copies them into the input argument. The
data format and layout are the same as documented in the architecture manual.

+If KVM_CAP_X2APIC_API is enabled, then the format of APIC_ID register depends
+on the APIC mode (reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores
+APIC ID in the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC
+ID which is stored in bits 31-24 of the APIC register, or equivalently in byte
+35 of struct kvm_lapic_state's regs field.
+
+If KVM_CAP_X2APIC_API is disabled, struct kvm_lapic_state always uses xAPIC
+format.
+

4.58 KVM_SET_LAPIC

@@ -1600,6 +1613,10 @@ struct kvm_lapic_state {
Copies the input argument into the Local APIC registers. The data format
and layout are the same as documented in the architecture manual.

+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+

4.59 KVM_IOEVENTFD

@@ -2180,6 +2197,10 @@ struct kvm_msi {

No flags are defined so far. The corresponding field must be 0.

+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id. Bits 7-0 of address_hi must be zero.
+

4.71 KVM_CREATE_PIT2

@@ -3811,6 +3832,17 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
Will return -EINVAL if the machine does not support runtime-instrumentation.
Will return -EBUSY if a VCPU has already been created.

+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: none
+Returns: 0
+
+Enabling this capability changes the behavior of KVM_SET_GSI_ROUTING,
+KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, allowing the use of 32-bit
+APIC IDs. See KVM_CAP_X2APIC_API in their respective sections.
+
+
8. Other capabilities.
----------------------

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 643e3dffcd85..f1b202b34c72 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -782,6 +782,8 @@ struct kvm_arch {
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+
+ bool x2apic_api;
};

struct kvm_vm_stat {
@@ -1364,7 +1366,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);

-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq);

static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 889563d50c55..2ff9691e4603 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}

-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
- trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+ trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_api ?
+ (u64)e->msi.address_hi << 32 : 0),
+ e->msi.data);

irq->dest_id = (e->msi.address_lo &
MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ if (kvm->arch.x2apic_api)
+ irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);

+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_api && (e->msi.address_hi & 0xff);
+}
+
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;

+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
if (!level)
return -1;

- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);

return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
@@ -153,7 +166,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
return -EWOULDBLOCK;


Shouldn't we check the msi_route_invalid in kvm_arch_set_irq_inatomic?

- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);

if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
return r;
@@ -286,6 +299,9 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ goto out;
break;
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
@@ -394,7 +410,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;

- kvm_set_msi_irq(entry, &irq);
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);

if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
irq.dest_id, irq.dest_mode))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3c2a8c113054..b47a82bfc8c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1991,10 +1991,15 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);

- if (set)
- *id >>= 24;
- else
- *id <<= 24;
+ if (vcpu->kvm->arch.x2apic_api) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
}

return 0;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0a6a8845ea0c..ada9437d6e18 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11075,7 +11075,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* We will support full lowest-priority interrupt later.
*/

- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
/*
* Make sure the IRTE is in remapped mode if
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b178c8c12717..5d4089dcb3eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2576,6 +2576,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_X2APIC_API:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -3799,6 +3800,10 @@ split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
+ case KVM_CAP_X2APIC_API:
+ kvm->arch.x2apic_api = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index f28292d73ddb..8ade3eb6c640 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
__entry->data = data;
),

- TP_printk("dst %u vec %u (%s|%s|%s%s)",
- (u8)(__entry->address >> 12), (u8)__entry->data,
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
__print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
(__entry->address & (1<<2)) ? "logical" : "physical",
(__entry->data & (1<<15)) ? "level" : "edge",
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf475104c..43b355d6db7b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_ARM_PMU_V3 126
#define KVM_CAP_VCPU_ATTRIBUTES 127
#define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129

#ifdef KVM_CAP_IRQ_ROUTING




--
best regards
yang