[PATCH] Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE message type

From: Andrea Parri (Microsoft)
Date: Thu Nov 26 2020 - 14:12:44 EST


Quoting from commit 7527810573436f ("Drivers: hv: vmbus: Introduce
the CHANNELMSG_MODIFYCHANNEL message type"),

"[...] Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL
messages with the promise that (after the ACK is sent) the
channel won't send any more interrupts to the "old" CPU.

The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is
problematic if the user want to take a CPU offline, since we
don't want to take a CPU offline (and, potentially, "lose"
channel interrupts on such CPU) if the host is still processing
a CHANNELMSG_MODIFYCHANNEL message associated to that CPU."

Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE(24) message type,
which embodies the type of the CHANNELMSG_MODIFYCHANNEL ACK.

Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@xxxxxxxxx>
---
drivers/hv/channel.c | 108 ++++++++++++++++++++++++++++++++------
drivers/hv/channel_mgmt.c | 42 +++++++++++++++
drivers/hv/connection.c | 3 +-
drivers/hv/hv.c | 52 ++++++++++++++++++
drivers/hv/hv_trace.h | 15 ++++++
drivers/hv/vmbus_drv.c | 4 +-
include/linux/hyperv.h | 13 ++++-
7 files changed, 218 insertions(+), 19 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index fbdda9938039a..6801d89a20051 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -209,31 +209,107 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
}
EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);

+static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+ struct vmbus_channel_modifychannel msg;
+ int ret;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+ msg.child_relid = channel->offermsg.child_relid;
+ msg.target_vp = target_vp;
+
+ ret = vmbus_post_msg(&msg, sizeof(msg), true);
+ trace_vmbus_send_modifychannel(&msg, ret);
+
+ return ret;
+}
+
+static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
+{
+ struct vmbus_channel_modifychannel *msg;
+ struct vmbus_channel_msginfo *info;
+ unsigned long flags;
+ int ret;
+
+ info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
+ sizeof(struct vmbus_channel_modifychannel),
+ GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ init_completion(&info->waitevent);
+ info->waiting_channel = channel;
+
+ msg = (struct vmbus_channel_modifychannel *)info->msg;
+ msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+ msg->child_relid = channel->offermsg.child_relid;
+ msg->target_vp = target_vp;
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ if (channel->rescind) {
+ ret = -ENODEV;
+ goto free_info;
+ }
+
+ ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_modifychannel), true);
+ trace_vmbus_send_modifychannel(msg, ret);
+ if (ret != 0)
+ goto clean_msglist;
+
+ /*
+ * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
+ * the mutex and be unable to signal the completion.
+ */
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ wait_for_completion(&info->waitevent);
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ if (channel->rescind) {
+ ret = -ENODEV;
+ goto free_info;
+ }
+
+ if (info->response.modify_response.status) {
+ kfree(info);
+ return -EAGAIN;
+ }
+
+ kfree(info);
+ return 0;
+
+clean_msglist:
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+free_info:
+ kfree(info);
+ return ret;
+}
+
/*
* Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
*
* CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not
- * ACK such messages. IOW we can't know when the host will stop interrupting
- * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ * ACK such messages before VERSION_WIN10_V5_3. Without ACK, we can not know
+ * when the host will stop interrupting the "old" vCPU and start interrupting
+ * the "new" vCPU for the given channel.
*
* The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
* VERSION_WIN10_V4_1.
*/
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
{
- struct vmbus_channel_modifychannel conn_msg;
- int ret;
-
- memset(&conn_msg, 0, sizeof(conn_msg));
- conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
- conn_msg.child_relid = child_relid;
- conn_msg.target_vp = target_vp;
-
- ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
-
- trace_vmbus_send_modifychannel(&conn_msg, ret);
-
- return ret;
+ if (vmbus_proto_version >= VERSION_WIN10_V5_3)
+ return send_modifychannel_with_ack(channel, target_vp);
+ return send_modifychannel_without_ack(channel, target_vp);
}
EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 1d44bb635bb84..8fcb66d623ba4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -1200,6 +1200,46 @@ static void vmbus_onopen_result(struct vmbus_channel_message_header *hdr)
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
}

+/*
+ * vmbus_onmodifychannel_response - Modify Channel response handler.
+ *
+ * This is invoked when we received a response to our channel modify request.
+ * Find the matching request, copy the response and signal the requesting thread.
+ */
+static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
+{
+ struct vmbus_channel_modifychannel_response *response;
+ struct vmbus_channel_msginfo *msginfo;
+ unsigned long flags;
+
+ response = (struct vmbus_channel_modifychannel_response *)hdr;
+
+ trace_vmbus_onmodifychannel_response(response);
+
+ /*
+ * Find the modify msg, copy the response and signal/unblock the wait event.
+ */
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+
+ list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
+ struct vmbus_channel_message_header *responseheader =
+ (struct vmbus_channel_message_header *)msginfo->msg;
+
+ if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
+ struct vmbus_channel_modifychannel *modifymsg;
+
+ modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
+ if (modifymsg->child_relid == response->child_relid) {
+ memcpy(&msginfo->response.modify_response, response,
+ sizeof(struct vmbus_channel_modifychannel_response));
+ complete(&msginfo->waitevent);
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+}
+
/*
* vmbus_ongpadl_created - GPADL created handler.
*
@@ -1366,6 +1406,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
{ CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0},
{ CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0},
{ CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0},
+ { CHANNELMSG_MODIFYCHANNEL_RESPONSE, 1, vmbus_onmodifychannel_response,
+ sizeof(struct vmbus_channel_modifychannel_response)},
};

/*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 11170d9a2e1a5..cdf41c504d914 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -45,6 +45,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
* Table of VMBus versions listed from newest to oldest.
*/
static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V5_3,
VERSION_WIN10_V5_2,
VERSION_WIN10_V5_1,
VERSION_WIN10_V5,
@@ -60,7 +61,7 @@ static __u32 vmbus_versions[] = {
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
* VMBus version for testing and debugging purpose.
*/
-static uint max_version = VERSION_WIN10_V5_2;
+static uint max_version = VERSION_WIN10_V5_3;

module_param(max_version, uint, S_IRUGO);
MODULE_PARM_DESC(max_version,
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index 0cde10fe0e71f..35f240f4c833e 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -16,6 +16,7 @@
#include <linux/version.h>
#include <linux/random.h>
#include <linux/clockchips.h>
+#include <linux/delay.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -237,6 +238,40 @@ void hv_synic_disable_regs(unsigned int cpu)
hv_set_synic_state(sctrl.as_uint64);
}

+#define HV_MAX_TRIES 3
+/*
+ * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one
+ * bit set, then wait for a few milliseconds. Repeat these steps for a maximum of 3 times.
+ * Return 'true', if there is still any set bit after this operation; 'false', otherwise.
+ */
+static bool hv_synic_event_pending(void)
+{
+ struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ union hv_synic_event_flags *event =
+ (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
+ unsigned long *recv_int_page = event->flags;
+ bool pending;
+ u32 relid;
+ int tries = 0;
+
+retry:
+ pending = false;
+ for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
+ /* Special case - VMBus channel protocol messages */
+ if (relid == 0)
+ continue;
+ if (sync_test_bit(relid, recv_int_page)) {
+ pending = true;
+ break;
+ }
+ }
+ if (pending && tries++ < HV_MAX_TRIES) {
+ usleep_range(10000, 20000);
+ goto retry;
+ }
+ return pending;
+}
+
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
@@ -276,6 +311,23 @@ int hv_synic_cleanup(unsigned int cpu)
if (channel_found && vmbus_connection.conn_state == CONNECTED)
return -EBUSY;

+ if (vmbus_proto_version >= VERSION_WIN10_V5_3) {
+ /*
+ * channel_found == false means that any channels that were previously
+ * assigned to the CPU have been reassigned elsewhere. Since we have
+ * received a ModifyChannel ACK from Hyper-V for all such reassignments,
+ * we know that Hyper-V won't set any new bits in the event flags page.
+ * However, there may be existing bits set in this page that have not
+ * been processed by vmbus_chan_sched(). We scan the event flags page
+ * looking for any bits that are set and waiting (with a timeout) for
+ * vmbus_chan_sched() to process such bits. If bits are still set after
+ * this operation (and VMBus is connected), we fail the CPU offlining
+ * operation.
+ */
+ if (hv_synic_event_pending() && vmbus_connection.conn_state == CONNECTED)
+ return -EBUSY;
+ }
+
hv_stimer_legacy_cleanup(cpu);

hv_synic_disable_regs(cpu);
diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
index 6063bb21bb137..3e83c24856dbe 100644
--- a/drivers/hv/hv_trace.h
+++ b/drivers/hv/hv_trace.h
@@ -86,6 +86,21 @@ TRACE_EVENT(vmbus_onopen_result,
)
);

+TRACE_EVENT(vmbus_onmodifychannel_response,
+ TP_PROTO(const struct vmbus_channel_modifychannel_response *response),
+ TP_ARGS(response),
+ TP_STRUCT__entry(
+ __field(u32, child_relid)
+ __field(u32, status)
+ ),
+ TP_fast_assign(__entry->child_relid = response->child_relid;
+ __entry->status = response->status;
+ ),
+ TP_printk("child_relid 0x%x, status %d",
+ __entry->child_relid, __entry->status
+ )
+ );
+
TRACE_EVENT(vmbus_ongpadl_created,
TP_PROTO(const struct vmbus_channel_gpadl_created *gpadlcreated),
TP_ARGS(gpadlcreated),
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 4fad3e6745e53..3e1cd5e8f769e 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1767,13 +1767,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (target_cpu == origin_cpu)
goto cpu_store_unlock;

- if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+ if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
goto cpu_store_unlock;
}

/*
+ * For version before VERSION_WIN10_V5_3, the following warning holds:
+ *
* Warning. At this point, there is *no* guarantee that the host will
* have successfully processed the vmbus_send_modifychannel() request.
* See the header comment of vmbus_send_modifychannel() for more info.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 1ce131f29f3b4..808acf4c3fe61 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent(
* 5 . 0 (Newer Windows 10)
* 5 . 1 (Windows 10 RS4)
* 5 . 2 (Windows Server 2019, RS5)
+ * 5 . 3 (Windows Server 2021) // FIXME: use proper version number/name
*/

#define VERSION_WS2008 ((0 << 16) | (13))
@@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent(
#define VERSION_WIN10_V5 ((5 << 16) | (0))
#define VERSION_WIN10_V5_1 ((5 << 16) | (1))
#define VERSION_WIN10_V5_2 ((5 << 16) | (2))
+#define VERSION_WIN10_V5_3 ((5 << 16) | (3))

/* Make maximum size of pipe payload of 16K */
#define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384)
@@ -475,6 +477,7 @@ enum vmbus_channel_message_type {
CHANNELMSG_TL_CONNECT_REQUEST = 21,
CHANNELMSG_MODIFYCHANNEL = 22,
CHANNELMSG_TL_CONNECT_RESULT = 23,
+ CHANNELMSG_MODIFYCHANNEL_RESPONSE = 24,
CHANNELMSG_COUNT
};

@@ -588,6 +591,13 @@ struct vmbus_channel_open_result {
u32 status;
} __packed;

+/* Modify Channel Result parameters */
+struct vmbus_channel_modifychannel_response {
+ struct vmbus_channel_message_header header;
+ u32 child_relid;
+ u32 status;
+} __packed;
+
/* Close channel parameters; */
struct vmbus_channel_close_channel {
struct vmbus_channel_message_header header;
@@ -720,6 +730,7 @@ struct vmbus_channel_msginfo {
struct vmbus_channel_gpadl_torndown gpadl_torndown;
struct vmbus_channel_gpadl_created gpadl_created;
struct vmbus_channel_version_response version_response;
+ struct vmbus_channel_modifychannel_response modify_response;
} response;

u32 msgsize;
@@ -1562,7 +1573,7 @@ extern __u32 vmbus_proto_version;

int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
const guid_t *shv_host_servie_id);
-int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
+int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
void vmbus_set_event(struct vmbus_channel *channel);

/* Get the start of the ring buffer. */
--
2.25.1