[RFC PATCH net-next v2 4/5] net/smc: avoid data copy from sndbuf to peer RMB in SMC-D loopback

From: Wen Gu
Date: Mon Dec 19 2022 - 22:22:43 EST


This patch aims to improve SMC-D loopback performance by avoiding
data copy from local sndbuf to peer RMB. The main idea is to let
local sndbuf and peer RMB share the same physical memory.

 +----------+                     +----------+
 | socket A |                     | socket B |
 +----------+                     +----------+
       |                               ^
       |         +---------+           |
  regard as      |         | ----------|
  local sndbuf   |  B's    |     regard as
       |         |  RMB    |     local RMB
       |-------> |         |
                 +---------+

For connections using smcd loopback device:

1. Only create and maintain the local RMB.
   a. Create or reuse an RMB when a connection is created;
   b. Free the RMB when the link group (lgr) is freed.

2. Attach the local sndbuf to the peer RMB.
   a. sndbuf_desc describes the same memory region as the peer rmb_desc.
   b. sndbuf_desc is exclusive to a specific connection and won't be
      added to the lgr buffer pool for reuse.
   c. The sndbuf is attached to the peer RMB once the remote token has
      been received via the CLC accept/confirm message.
   d. The sndbuf is detached from the peer RMB when the connection is freed.

Therefore, data copied from userspace to the local sndbuf directly
reaches the peer RMB.

Signed-off-by: Wen Gu <guwen@xxxxxxxxxxxxxxxxx>
---
net/smc/af_smc.c | 23 +++++++++++++++++++-
net/smc/smc_core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
net/smc/smc_core.h | 2 ++
3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b9884c8..c7de566 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1073,7 +1073,6 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
* The RFC patch hasn't resolved this, just simply always
* chooses loopback device first, and fallback if loopback
* communication is impossible.
- *
*/
/* check if there is an ism or loopback device available */
if (!(ini->smcd_version & SMC_V1) ||
@@ -1397,6 +1396,17 @@ static int smc_connect_ism(struct smc_sock *smc,
}

smc_conn_save_peer_info(smc, aclc);
+
+ /* special for smcd loopback
+ * conns above smcd loopback dev only create their rmbs.
+ * their sndbufs are 'maps' of peer rmbs.
+ */
+ if (smc->conn.lgr->smcd->is_loopback) {
+ rc = smcd_buf_attach(&smc->conn);
+ if (rc)
+ goto connect_abort;
+ smc->sk.sk_sndbuf = 2 * (smc->conn.sndbuf_desc->len);
+ }
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
@@ -2464,6 +2474,17 @@ static void smc_listen_work(struct work_struct *work)
mutex_unlock(&smc_server_lgr_pending);
}
smc_conn_save_peer_info(new_smc, cclc);
+
+ /* special for smcd loopback
+ * conns above smcd loopback dev only create their rmbs.
+ * their sndbufs are 'maps' of peer rmbs.
+ */
+ if (ini->is_smcd && new_smc->conn.lgr->smcd->is_loopback) {
+ rc = smcd_buf_attach(&new_smc->conn);
+ if (rc)
+ goto out_decl;
+ new_smc->sk.sk_sndbuf = 2 * (new_smc->conn.sndbuf_desc->len);
+ }
smc_listen_out_connected(new_smc);
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
goto out_free;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index c305d8d..bf40ad3 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1171,6 +1171,10 @@ void smc_conn_free(struct smc_connection *conn)
if (!list_empty(&lgr->list))
smc_ism_unset_conn(conn);
tasklet_kill(&conn->rx_tsklet);
+
+ /* detach sndbuf from peer rmb */
+ if (lgr->smcd->is_loopback)
+ smcd_buf_detach(conn);
} else {
smc_cdc_wait_pend_tx_wr(conn);
if (current_work() != &conn->abort_work)
@@ -2423,6 +2427,14 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
{
int rc;

+ if (is_smcd && smc->conn.lgr->smcd->is_loopback) {
+ /* Conns above smcd loopback device only create and maintain
+ * their RMBs. The sndbufs will be attached to peer RMBs once
+ * getting the tokens.
+ */
+ return __smc_buf_create(smc, is_smcd, true);
+ }
+
/* create send buffer */
rc = __smc_buf_create(smc, is_smcd, false);
if (rc)
@@ -2439,6 +2451,56 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
return rc;
}

+/* For SMC-D loopback conns, attach the local sndbuf to the peer RMB, so that
+ * data copied to the sndbuf directly reaches the peer RMB.
+ * Returns 0 on success, -ENOMEM if the descriptor allocation fails, or
+ * SMC_CLC_DECL_ERR_RTOK if attaching to the peer token fails.
+ */
+int smcd_buf_attach(struct smc_connection *conn)
+{
+ struct smcd_dev *smcd = conn->lgr->smcd;
+ u64 peer_token = conn->peer_token;
+ struct smc_buf_desc *buf_desc;
+ int rc;
+
+ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
+ if (!buf_desc)
+ return -ENOMEM;
+ rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc);
+ if (rc) {
+ rc = SMC_CLC_DECL_ERR_RTOK;
+ goto free;
+ }
+
+ /* from here on, the local sndbuf refers to the peer RMB */
+ /* skip the leading smcd_cdc_msg-sized area to align with the peer RMB */
+ buf_desc->cpu_addr = (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg);
+ buf_desc->len -= sizeof(struct smcd_cdc_msg);
+ conn->sndbuf_desc = buf_desc;
+ conn->sndbuf_desc->used = 1;
+ /* callers in af_smc.c set sk_sndbuf after a successful attach */
+ atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len);
+ return 0;
+
+free:
+ kfree(buf_desc);
+ return rc;
+}
+
+void smcd_buf_detach(struct smc_connection *conn)
+{
+ struct smcd_dev *smcd = conn->lgr->smcd;
+ u64 peer_token = conn->peer_token;
+
+ if (!conn->sndbuf_desc)
+ return;
+ /* undo smcd_buf_attach(): detach from the peer RMB, free the descriptor */
+ smc_ism_detach_dmb(smcd, peer_token);
+
+ kfree(conn->sndbuf_desc);
+ conn->sndbuf_desc = NULL;
+}
+
static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
int i;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 285f9bd..b51b020 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -518,6 +518,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
void smc_smcd_terminate_all(struct smcd_dev *dev);
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
+int smcd_buf_attach(struct smc_connection *conn);
+void smcd_buf_detach(struct smc_connection *conn);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc);
--
1.8.3.1