From af730e08203522dbf5a03853576c5b43c9d1afea Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 29 Mar 2024 14:42:59 +0800 Subject: drm/amdgpu: Add interface to reserve bad page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add interface to reserve bad page. Signed-off-by: YiPeng Chai Reviewed-by: Christian König Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 8d26989c75c8..ab5bf573378e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -500,6 +500,7 @@ struct amdgpu_ras { wait_queue_head_t page_retirement_wq; struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; + struct mutex page_rsv_lock; /* Fatal error detected flag */ atomic_t fed; @@ -909,4 +910,7 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id); u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); + +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); + #endif -- cgit From 98b5bc878d4b522c035309c8f6d3247d54050369 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 19 Mar 2024 09:57:58 +0800 Subject: drm/amdgpu: add message fifo to handle RAS poison events Add message fifo to handle RAS poison events. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ab5bf573378e..2b15996f1ede 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -26,6 +26,7 @@ #include #include +#include #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" #include "amdgpu_smuio.h" @@ -442,6 +443,17 @@ struct ras_query_context { u64 event_id; }; +typedef int (*pasid_notify)(struct amdgpu_device *adev, + uint16_t pasid, void *data); + +struct ras_poison_msg { + enum amdgpu_ras_block block; + uint16_t pasid; + uint32_t reset; + pasid_notify pasid_fn; + void *data; +}; + struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ @@ -501,6 +513,8 @@ struct amdgpu_ras { struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; struct mutex page_rsv_lock; + DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); + /* Fatal error detected flag */ atomic_t fed; @@ -913,4 +927,8 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); +int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t reset); + #endif -- cgit From f493dd64ee6680dc5bb46d7c800346eadb18049a Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Mon, 22 Apr 2024 17:37:36 +0800 Subject: drm/amdgpu: prepare for logging ecc errors Prepare for logging ecc errors. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 2b15996f1ede..634654cf2634 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" #include "amdgpu_smuio.h" @@ -454,6 +456,26 @@ struct ras_poison_msg { void *data; }; +struct ras_err_pages { + uint32_t count; + uint64_t *pfn; +}; + +struct ras_ecc_err { + u64 hash_index; + uint64_t status; + uint64_t ipid; + uint64_t addr; + struct ras_err_pages err_pages; +}; + +struct ras_ecc_log_info { + struct mutex lock; + siphash_key_t ecc_key; + struct radix_tree_root de_page_tree; + bool de_updated; +}; + struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ @@ -514,6 +536,7 @@ struct amdgpu_ras { atomic_t page_retirement_req_cnt; struct mutex page_rsv_lock; DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); + struct ras_ecc_log_info umc_ecc_log; /* Fatal error detected flag */ atomic_t fed; -- cgit From 2cf8e50ec381e6a6be3835a421f279d88fcb5ba4 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Mon, 22 Apr 2024 17:38:54 +0800 Subject: drm/amdgpu: Add delay work to retire bad pages Add delay work to retire bad pages. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 634654cf2634..cb5a0f31d201 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -537,6 +537,7 @@ struct amdgpu_ras { struct mutex page_rsv_lock; DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); struct ras_ecc_log_info umc_ecc_log; + struct delayed_work page_retirement_dwork; /* Fatal error detected flag */ atomic_t fed; -- cgit From e74313be5a71df63e307ad98b6ab202b8a222817 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Fri, 22 Mar 2024 12:06:01 +0800 Subject: drm/amdgpu: add condition check for amdgpu_umc_fill_error_record Add condition check for amdgpu_umc_fill_error_record. Signed-off-by: YiPeng Chai Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index cb5a0f31d201..c8980d5f6540 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -579,6 +579,7 @@ struct ras_err_data { unsigned long de_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; + unsigned long err_addr_len; u32 err_list_count; struct list_head err_node_list; }; -- cgit