[PATCH v4 4/6] RDMA/rxe: Implement flush execution in responder side

From: Li Zhijian
Date: Fri Aug 05 2022 - 03:40:35 EST


In contrast to other opcodes, after a series of sanity checking, FLUSH
opcode will do a Placement Type checking before it really do the FLUSH
operation. Only the requesting placement types that also registered in
the destination memory region are acceptable.

Otherwise, responder will also reply NAK "Remote Access Error" if it
found a placement type violation.

We will persist data via arch_wb_cache_pmem(), which could be
architecture specific.

Signed-off-by: Li Zhijian <lizhijian@xxxxxxxxxxx>
---
v4: add send_read_response_ack and flush resource
---
drivers/infiniband/sw/rxe/rxe_hdr.h | 28 ++++
drivers/infiniband/sw/rxe/rxe_loc.h | 2 +
drivers/infiniband/sw/rxe/rxe_mr.c | 4 +-
drivers/infiniband/sw/rxe/rxe_resp.c | 187 +++++++++++++++++++++++++-
drivers/infiniband/sw/rxe/rxe_verbs.h | 6 +
include/uapi/rdma/ib_user_verbs.h | 10 ++
6 files changed, 231 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
index 8063b5018445..2fe98146130e 100644
--- a/drivers/infiniband/sw/rxe/rxe_hdr.h
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -626,6 +626,34 @@ static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level)
*p = cpu_to_be32(feth);
}

+static inline u32 __feth_plt(void *arg)
+{
+ __be32 *fethp = arg;
+ u32 feth = be32_to_cpu(*fethp);
+
+ return (feth & FETH_PLT_MASK) >> FETH_PLT_SHIFT;
+}
+
+static inline u32 __feth_sel(void *arg)
+{
+ __be32 *fethp = arg;
+ u32 feth = be32_to_cpu(*fethp);
+
+ return (feth & FETH_SEL_MASK) >> FETH_SEL_SHIFT;
+}
+
+static inline u32 feth_plt(struct rxe_pkt_info *pkt)
+{
+ return __feth_plt(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
+static inline u32 feth_sel(struct rxe_pkt_info *pkt)
+{
+ return __feth_sel(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
/******************************************************************************
* Atomic Extended Transport Header
******************************************************************************/
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 22f6cc31d1d6..a77266cdc066 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -72,6 +72,8 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
enum rxe_mr_copy_dir dir);
int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
void *addr, int length, enum rxe_mr_copy_dir dir);
+void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
+ size_t *offset_out);
void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length);
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
enum rxe_mr_lookup_type type);
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 24ca014cdecd..98460fde7332 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -238,8 +238,8 @@ int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
return err;
}

-static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
- size_t *offset_out)
+void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
+ size_t *offset_out)
{
size_t offset = iova - mr->iova + mr->offset;
int map_index;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 4c398fa220fa..4602cfbab78f 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -5,6 +5,7 @@
*/

#include <linux/skbuff.h>
+#include <linux/libnvdimm.h>

#include "rxe.h"
#include "rxe_loc.h"
@@ -19,9 +20,11 @@ enum resp_states {
RESPST_CHK_RESOURCE,
RESPST_CHK_LENGTH,
RESPST_CHK_RKEY,
+ RESPST_CHK_PLT,
RESPST_EXECUTE,
RESPST_READ_REPLY,
RESPST_ATOMIC_REPLY,
+ RESPST_PROCESS_FLUSH,
RESPST_COMPLETE,
RESPST_ACKNOWLEDGE,
RESPST_CLEANUP,
@@ -36,6 +39,7 @@ enum resp_states {
RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
RESPST_ERR_RNR,
RESPST_ERR_RKEY_VIOLATION,
+ RESPST_ERR_PLT_VIOLATION,
RESPST_ERR_INVALIDATE_RKEY,
RESPST_ERR_LENGTH,
RESPST_ERR_CQ_OVERFLOW,
@@ -54,9 +58,11 @@ static char *resp_state_name[] = {
[RESPST_CHK_RESOURCE] = "CHK_RESOURCE",
[RESPST_CHK_LENGTH] = "CHK_LENGTH",
[RESPST_CHK_RKEY] = "CHK_RKEY",
+ [RESPST_CHK_PLT] = "CHK_PLACEMENT_TYPE",
[RESPST_EXECUTE] = "EXECUTE",
[RESPST_READ_REPLY] = "READ_REPLY",
[RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY",
+ [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH",
[RESPST_COMPLETE] = "COMPLETE",
[RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE",
[RESPST_CLEANUP] = "CLEANUP",
@@ -71,6 +77,7 @@ static char *resp_state_name[] = {
[RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ",
[RESPST_ERR_RNR] = "ERR_RNR",
[RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION",
+ [RESPST_ERR_PLT_VIOLATION] = "ERR_PLACEMENT_TYPE_VIOLATION",
[RESPST_ERR_INVALIDATE_RKEY] = "ERR_INVALIDATE_RKEY_VIOLATION",
[RESPST_ERR_LENGTH] = "ERR_LENGTH",
[RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW",
@@ -402,6 +409,24 @@ static enum resp_states check_length(struct rxe_qp *qp,
}
}

+static enum resp_states check_placement_type(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_mr *mr = qp->resp.mr;
+ u32 plt = feth_plt(pkt);
+
+ if ((plt & IB_EXT_PLT_GLB_VIS &&
+ !(mr->access & IB_ACCESS_FLUSH_GLOBAL_VISIBILITY)) ||
+ (plt & IB_EXT_PLT_PERSIST &&
+ !(mr->access & IB_ACCESS_FLUSH_PERSISTENT))) {
+ pr_info("Target MR didn't support this placement type, registered flag: %x, requested flag: %x\n",
+ (mr->access & IB_ACCESS_FLUSHABLE) >> 8, plt);
+ return RESPST_ERR_PLT_VIOLATION;
+ }
+
+ return RESPST_EXECUTE;
+}
+
static enum resp_states check_rkey(struct rxe_qp *qp,
struct rxe_pkt_info *pkt)
{
@@ -415,7 +440,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
enum resp_states state;
int access;

- if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
+ if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_FLUSH_MASK)) {
if (pkt->mask & RXE_RETH_MASK) {
qp->resp.va = reth_va(pkt);
qp->resp.offset = 0;
@@ -423,8 +448,12 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
qp->resp.resid = reth_len(pkt);
qp->resp.length = reth_len(pkt);
}
- access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
- : IB_ACCESS_REMOTE_WRITE;
+ if (pkt->mask & RXE_FLUSH_MASK)
+ access = IB_ACCESS_FLUSHABLE;
+ else if (pkt->mask & RXE_READ_MASK)
+ access = IB_ACCESS_REMOTE_READ;
+ else
+ access = IB_ACCESS_REMOTE_WRITE;
} else if (pkt->mask & RXE_ATOMIC_MASK) {
qp->resp.va = atmeth_va(pkt);
qp->resp.offset = 0;
@@ -436,8 +465,10 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
}

/* A zero-byte op is not required to set an addr or rkey. */
+ /* RXE_FETH_MASK carraies zero-byte payload */
if ((pkt->mask & RXE_READ_OR_WRITE_MASK) &&
(pkt->mask & RXE_RETH_MASK) &&
+ !(pkt->mask & RXE_FETH_MASK) &&
reth_len(pkt) == 0) {
return RESPST_EXECUTE;
}
@@ -507,7 +538,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
WARN_ON_ONCE(qp->resp.mr);

qp->resp.mr = mr;
- return RESPST_EXECUTE;
+ return pkt->mask & RXE_FETH_MASK ? RESPST_CHK_PLT : RESPST_EXECUTE;

err:
if (mr)
@@ -553,6 +584,64 @@ static enum resp_states write_data_in(struct rxe_qp *qp,
return rc;
}

+static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
+{
+ int err;
+ int bytes;
+ u8 *va;
+ struct rxe_map **map;
+ struct rxe_phys_buf *buf;
+ int m;
+ int i;
+ size_t offset;
+
+ if (length == 0)
+ return 0;
+
+ if (mr->type == IB_MR_TYPE_DMA) {
+ err = -EFAULT;
+ goto err1;
+ }
+
+ err = mr_check_range(mr, iova, length);
+ if (err) {
+ err = -EFAULT;
+ goto err1;
+ }
+
+ lookup_iova(mr, iova, &m, &i, &offset);
+
+ map = mr->map + m;
+ buf = map[0]->buf + i;
+
+ while (length > 0) {
+ va = (u8 *)(uintptr_t)buf->addr + offset;
+ bytes = buf->size - offset;
+
+ if (bytes > length)
+ bytes = length;
+
+ arch_wb_cache_pmem(va, bytes);
+
+ length -= bytes;
+
+ offset = 0;
+ buf++;
+ i++;
+
+ if (i == RXE_BUF_PER_MAP) {
+ i = 0;
+ map++;
+ buf = map[0]->buf;
+ }
+ }
+
+ return 0;
+
+err1:
+ return err;
+}
+
static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
struct rxe_pkt_info *pkt,
int type)
@@ -587,11 +676,60 @@ static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
res->last_psn = pkt->psn;
res->cur_psn = pkt->psn;
break;
+ case RXE_FLUSH_MASK:
+ res->flush.va = qp->resp.va + qp->resp.offset;
+ res->flush.length = qp->resp.length;
+ res->flush.type = feth_plt(pkt);
+ res->flush.level = feth_sel(pkt);
}

return res;
}

+static enum resp_states process_flush(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ u64 length, start;
+ struct rxe_mr *mr = qp->resp.mr;
+ struct resp_res *res = qp->resp.res;
+
+ /* oA19-14, oA19-15 */
+ if (res && res->replay)
+ return RESPST_ACKNOWLEDGE;
+ else if (!res) {
+ res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK);
+ qp->resp.res = res;
+ }
+
+ if (res->flush.level == IB_EXT_SEL_MR_RANGE) {
+ start = res->flush.va;
+ length = res->flush.length;
+ } else { /* level == IB_EXT_SEL_MR_WHOLE */
+ start = mr->iova;
+ length = mr->length;
+ }
+
+ if (res->flush.type & IB_EXT_PLT_PERSIST) {
+ if (nvdimm_flush_iova(mr, start, length))
+ return RESPST_ERR_RKEY_VIOLATION;
+ /* Make data persistent. */
+ wmb();
+ } else if (res->flush.type & IB_EXT_PLT_GLB_VIS)
+ /* Make data global visibility. */
+ wmb();
+
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ qp->resp.ack_psn = qp->resp.psn;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+
+ return RESPST_ACKNOWLEDGE;
+}
+
/* Guarantee atomicity of atomic operations at the machine level. */
static DEFINE_SPINLOCK(atomic_ops_lock);

@@ -892,6 +1030,8 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
return RESPST_READ_REPLY;
} else if (pkt->mask & RXE_ATOMIC_MASK) {
return RESPST_ATOMIC_REPLY;
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ return RESPST_PROCESS_FLUSH;
} else {
/* Unreachable */
WARN_ON_ONCE(1);
@@ -1065,6 +1205,19 @@ static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
return ret;
}

+static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
+{
+ int ret = send_common_ack(qp, syndrome, psn,
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY,
+ "RDMA READ response of length zero ACK");
+
+ /* have to clear this since it is used to trigger
+ * long read replies
+ */
+ qp->resp.res = NULL;
+ return ret;
+}
+
static enum resp_states acknowledge(struct rxe_qp *qp,
struct rxe_pkt_info *pkt)
{
@@ -1075,6 +1228,8 @@ static enum resp_states acknowledge(struct rxe_qp *qp,
send_ack(qp, qp->resp.aeth_syndrome, pkt->psn);
else if (pkt->mask & RXE_ATOMIC_MASK)
send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
+ else if (pkt->mask & RXE_FLUSH_MASK)
+ send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
else if (bth_ack(pkt))
send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);

@@ -1131,6 +1286,22 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
/* SEND. Ack again and cleanup. C9-105. */
send_ack(qp, AETH_ACK_UNLIMITED, prev_psn);
return RESPST_CLEANUP;
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ struct resp_res *res;
+
+ /* Find the operation in our list of responder resources. */
+ res = find_resource(qp, pkt->psn);
+ if (res) {
+ res->replay = 1;
+ res->cur_psn = pkt->psn;
+ qp->resp.res = res;
+ rc = RESPST_PROCESS_FLUSH;
+ goto out;
+ }
+
+ /* Resource not found. Class D error. Drop the request. */
+ rc = RESPST_CLEANUP;
+ goto out;
} else if (pkt->mask & RXE_READ_MASK) {
struct resp_res *res;

@@ -1312,6 +1483,9 @@ int rxe_responder(void *arg)
case RESPST_CHK_RKEY:
state = check_rkey(qp, pkt);
break;
+ case RESPST_CHK_PLT:
+ state = check_placement_type(qp, pkt);
+ break;
case RESPST_EXECUTE:
state = execute(qp, pkt);
break;
@@ -1324,6 +1498,9 @@ int rxe_responder(void *arg)
case RESPST_ATOMIC_REPLY:
state = atomic_reply(qp, pkt);
break;
+ case RESPST_PROCESS_FLUSH:
+ state = process_flush(qp, pkt);
+ break;
case RESPST_ACKNOWLEDGE:
state = acknowledge(qp, pkt);
break;
@@ -1369,6 +1546,8 @@ int rxe_responder(void *arg)
break;

case RESPST_ERR_RKEY_VIOLATION:
+ /* oA19-13 8 */
+ case RESPST_ERR_PLT_VIOLATION:
if (qp_type(qp) == IB_QPT_RC) {
/* Class C */
do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 96af3e054f4d..ac04cd275400 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -165,6 +165,12 @@ struct resp_res {
u64 va;
u32 resid;
} read;
+ struct {
+ u32 length;
+ u64 va;
+ u8 type;
+ u8 level;
+ } flush;
};
};

diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 808cf7a39498..4efa3d76d71d 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -105,6 +105,16 @@ enum {
IB_USER_VERBS_EX_CMD_MODIFY_CQ
};

+enum ib_ext_placement_type {
+ IB_EXT_PLT_GLB_VIS = 1 << 0,
+ IB_EXT_PLT_PERSIST = 1 << 1,
+};
+
+enum ib_ext_selectivity_level {
+ IB_EXT_SEL_MR_RANGE = 0, /* select a MR range */
+ IB_EXT_SEL_MR_WHOLE, /* select the whole MR */
+};
+
/*
* Make sure that all structs defined in this file remain laid out so
* that they pack the same way on 32-bit and 64-bit architectures (to
--
2.31.1