Re: [PATCHv2 7/7] nvme: add support for enhanced metadata

From: Hannes Reinecke
Date: Wed Feb 02 2022 - 08:28:57 EST


On 2/1/22 20:01, Keith Busch wrote:
NVM Express ratified TP 4069 defines new protection information formats.
Implement support for the CRC64 guard tags.

Since the block layer doesn't support variable length reference tags,
driver support for the Storage Tag space is not supported at this time.

Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
v1->v2:

Added support for PRACT

Fixed endian conversion

drivers/nvme/host/core.c | 164 +++++++++++++++++++++++++++++++++------
drivers/nvme/host/nvme.h | 4 +-
include/linux/nvme.h | 53 +++++++++++--
3 files changed, 190 insertions(+), 31 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b3eabf6a08b9..0f2ea2a4c718 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -882,6 +882,30 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK;
}
+static inline void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
+ struct request *req)
+{
+ u32 upper, lower;
+ u64 ref48;
+
+ /* both rw and write zeroes share the same reftag format */
+ switch (ns->guard_type) {
+ case NVME_NVM_NS_16B_GUARD:
+ cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
+ break;
+ case NVME_NVM_NS_64B_GUARD:
+ ref48 = nvme_pi_extended_ref_tag(req);
+ lower = lower_32_bits(ref48);
+ upper = upper_32_bits(ref48);
+
+ cmnd->rw.reftag = cpu_to_le32(lower);
+ cmnd->rw.cdw3 = cpu_to_le32(upper);
+ break;
+ default:
+ break;
+ }
+}
+
static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
@@ -903,8 +927,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- cmnd->write_zeroes.reftag =
- cpu_to_le32(t10_pi_ref_tag(req));
+ nvme_set_ref_tag(ns, cmnd, req);
break;
}
}
@@ -931,7 +954,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.opcode = op;
cmnd->rw.flags = 0;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
- cmnd->rw.rsvd2 = 0;
+ cmnd->rw.cdw2 = 0;
+ cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -965,7 +989,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
NVME_RW_PRINFO_PRCHK_REF;
if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
- cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
+ nvme_set_ref_tag(ns, cmnd, req);
break;
}
}
@@ -1619,33 +1643,58 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
u32 max_integrity_segments)
{
struct blk_integrity integrity = { };
- switch (pi_type) {
+ switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- integrity.profile = &t10_pi_type3_crc;
- integrity.tag_size = sizeof(u16) + sizeof(u32);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ switch (ns->guard_type) {
+ case NVME_NVM_NS_16B_GUARD:
+ integrity.profile = &t10_pi_type3_crc;
+ integrity.tag_size = sizeof(u16) + sizeof(u32);
+ integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ break;
+ case NVME_NVM_NS_64B_GUARD:
+ integrity.profile = &nvme_pi_type1_crc64;
+ integrity.tag_size = sizeof(u16) + 6;
+ integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ break;
+ default:
+ integrity.profile = NULL;
+ break;
+ }
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- integrity.profile = &t10_pi_type1_crc;
- integrity.tag_size = sizeof(u16);
- integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ switch (ns->guard_type) {
+ case NVME_NVM_NS_16B_GUARD:
+ integrity.profile = &t10_pi_type1_crc;
+ integrity.tag_size = sizeof(u16);
+ integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ break;
+ case NVME_NVM_NS_64B_GUARD:
+ integrity.profile = &nvme_pi_type1_crc64;
+ integrity.tag_size = sizeof(u16);

Is that correct? Shouldn't it be '8' like in the above case?

+ integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+ break;
+ default:
+ integrity.profile = NULL;
+ break;
+ }
break;
default:
integrity.profile = NULL;
break;
}
- integrity.tuple_size = ms;
+
+ integrity.tuple_size = ns->ms;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
#else
-static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
u32 max_integrity_segments)
{
}
@@ -1722,17 +1771,75 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return 0;
}
-static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
{
+ bool first = id->dps & NVME_NS_DPS_PI_FIRST;
+ unsigned lbaf = nvme_lbaf_index(id->flbas);
struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_command c = { };
+ struct nvme_id_ns_nvm *nvm;
+ int ret = 0;
+ u32 elbaf;
+
+ ns->pi_size = 0;
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+ ns->pi_size = sizeof(struct t10_pi_tuple);
+ ns->guard_type = NVME_NVM_NS_16B_GUARD;
+ goto set_pi;
+ }
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
- if (id->dps & NVME_NS_DPS_PI_FIRST ||
- ns->ms == sizeof(struct t10_pi_tuple))
+ nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
+ if (!nvm)
+ return -ENOMEM;
+
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+ c.identify.cns = NVME_ID_CNS_CS_NS;
+ c.identify.csi = NVME_CSI_NVM;
+
+ ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
+ if (ret)
+ goto free_data;
+
+ elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+
+ /* no support for storage tag formats right now */
+ if (nvme_elbaf_sts(elbaf))
+ goto free_data;
+
+ ns->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (ns->guard_type) {
+ case NVME_NVM_NS_64B_GUARD:
+ ns->pi_size = sizeof(struct nvme_crc64_pi_tuple);
+ break;
+ case NVME_NVM_NS_16B_GUARD:
+ ns->pi_size = sizeof(struct t10_pi_tuple);
+ break;
+ default:
+ break;
+ }
+
+free_data:
+ kfree(nvm);
+set_pi:
+ if (ns->pi_size && (first || ns->ms == ns->pi_size))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
ns->pi_type = 0;
+ return ret;
+}
+
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
+
+ ret = nvme_init_ms(ns, id);
+ if (ret)
+ return ret;
+
ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
@@ -1850,7 +1957,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
if (ns->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
(ns->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, ns->ms, ns->pi_type,
+ nvme_init_integrity(disk, ns,
ns->ctrl->max_integrity_segments);
else if (!nvme_ns_has_pi(ns))
capacity = 0;
@@ -1905,7 +2012,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
{
- unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+ unsigned lbaf = nvme_lbaf_index(id->flbas);
int ret;
blk_mq_freeze_queue(ns->disk->queue);
@@ -2252,20 +2359,27 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
return ret;
}
-static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
{
struct nvme_feat_host_behavior *host;
+ u8 acre = 0, lbafee = 0;
int ret;
/* Don't bother enabling the feature if retry delay is not reported */
- if (!ctrl->crdt[0])
+ if (ctrl->crdt[0])
+ acre = NVME_ENABLE_ACRE;
+ if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
+ lbafee = NVME_ENABLE_LBAFEE;
+
+ if (!acre && !lbafee)
return 0;
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host)
return 0;
- host->acre = NVME_ENABLE_ACRE;
+ host->acre = acre;
+ host->lbafee = lbafee;
ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
host, sizeof(*host), NULL);
kfree(host);
@@ -3104,7 +3218,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
- ret = nvme_configure_acre(ctrl);
+ ret = nvme_configure_host_options(ctrl);
if (ret < 0)
return ret;

This could be made into a separate patch, is it's not directly related to PI support.

Cheers,

Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@xxxxxxx +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer