[PATCH 02/20] habanalabs/gaudi: add razwi notify event

From: Oded Gabbay
Date: Thu Nov 17 2022 - 11:20:10 EST


From: Dani Liberman <dliberman@xxxxxxxxx>

Each time razwi (read-only zero, write ignore) happens, besides
capturing its data, also notify the user about it.

Signed-off-by: Dani Liberman <dliberman@xxxxxxxxx>
Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx>
---
drivers/misc/habanalabs/common/device.c | 8 +++++
drivers/misc/habanalabs/common/habanalabs.h | 2 ++
drivers/misc/habanalabs/gaudi/gaudi.c | 37 +++++++++++----------
include/uapi/misc/habanalabs.h | 2 ++
4 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 35ed494fcfdf..d1a609589558 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
num_of_engines * sizeof(u16));
hdev->captured_err_info.razwi.flags = flags;
}
+
+void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+ u8 flags, u64 *event_mask)
+{
+ hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags);
+ *event_mask |= HL_NOTIFIER_EVENT_RAZWI;
+}
+
static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu)
{
struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info;
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index e391e7951fb7..d9335f3769b8 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
u8 flags);
+void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+ u8 flags, u64 *event_mask);
void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);

#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3dfb9ecf7db3..035865cb097c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
}

static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
- bool razwi)
+ bool razwi, u64 *event_mask)
{
bool is_read = false, is_write = false;
u16 engine_id[2], num_of_razwi_eng = 0;
@@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
num_of_razwi_eng = 1;
}

- hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
+ hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags,
+ event_mask);
}
}

@@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
case GAUDI_EVENT_MMU_DERR:
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
@@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_AXI_ECC:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;
@@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM1_SPI_0:
case GAUDI_EVENT_HBM2_SPI_0:
case GAUDI_EVENT_HBM3_SPI_0:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
@@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM1_SPI_1:
case GAUDI_EVENT_HBM2_SPI_1:
case GAUDI_EVENT_HBM3_SPI_1:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
@@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
* if the event is a TPC Assertion or a "real" TPC DEC.
*/
event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT;
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error");
@@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_TPC5_KRN_ERR:
case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR");
@@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
fallthrough;
case GAUDI_EVENT_MMU_SERR:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
@@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_CPU_AXI_SPLITTER:
case GAUDI_EVENT_PSOC_AXI_DEC:
case GAUDI_EVENT_PSOC_PRSTN_FALL:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;

case GAUDI_EVENT_MMU_PAGE_FAULT:
case GAUDI_EVENT_MMU_WR_PERM:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
@@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_NIC4_QM1:
case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE:
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
gaudi_handle_qman_err(hdev, event_type, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET);
break;

case GAUDI_EVENT_RAZWI_OR_ADC_SW:
- gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_print_irq_info(hdev, event_type, true, &event_mask);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
goto reset_device;

@@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
case GAUDI_EVENT_TPC6_BMON_SPMU:
case GAUDI_EVENT_TPC7_BMON_SPMU:
case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
hl_fw_unmask_irq(hdev, event_type);
event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
@@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
break;

case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_sm_sei_info(hdev, event_type,
&eq_entry->sm_sei_data);
rc = hl_state_dump(hdev);
@@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr
break;

case GAUDI_EVENT_DEV_RESET_REQ:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;

case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;

case GAUDI_EVENT_FW_ALIVE_S:
- gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_irq_info(hdev, event_type, false, &event_mask);
gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive);
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
goto reset_device;
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 58343998bd63..7747e19e81fe 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -721,6 +721,7 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable
* HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
+ * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -729,6 +730,7 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4)
#define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5)
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
+#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)

/* Opcode for management ioctl
*
--
2.25.1