From a7e8467fbeee654e390aad1736291d273b407a2c Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Thu, 11 Jul 2024 16:27:08 +0800 Subject: drm/amdgpu: Remove unused code Remove unused code. Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d0307c55da50..0fb2d9285834 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2881,9 +2881,6 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) { mutex_init(&ecc_log->lock); - /* Set any value as siphash key */ - memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key)); - INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); ecc_log->de_queried_count = 0; ecc_log->prev_de_queried_count = 0; @@ -4611,8 +4608,6 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d if (!err_node) return NULL; - INIT_LIST_HEAD(&err_node->err_info.err_addr_list); - memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); err_data->err_list_count++; @@ -4622,18 +4617,6 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d return &err_node->err_info; } -void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr) -{ - /* This function will be retired. */ - return; -} - -void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr) -{ - list_del(&mca_err_addr->node); - kfree(mca_err_addr); -} - int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, struct amdgpu_smuio_mcm_config_info *mcm_info, struct ras_err_addr *err_addr, u64 count) @@ -4650,9 +4633,6 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, if (!err_info) return -EINVAL; - if (err_addr && err_addr->err_status) - amdgpu_ras_add_mca_err_addr(err_info, err_addr); - err_info->ue_count += count; err_data->ue_count += count; @@ -4697,9 +4677,6 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, if (!err_info) return -EINVAL; - if (err_addr && err_addr->err_status) - amdgpu_ras_add_mca_err_addr(err_info, err_addr); - err_info->de_count += count; err_data->de_count += count; -- cgit From dfe9d047b162f3a79ab63046608c693ee14c5b7a Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 1 Aug 2024 13:45:27 +0800 Subject: drm/amdgpu: Add more types for boot time error reporting Data abort exception and unknown errors are supported. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0fb2d9285834..9cda368ad794 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -4748,6 +4748,16 @@ static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", socket_id, aid_id, hbm_id, fw_status); + + if (AMDGPU_RAS_GPU_ERR_DATA_ABORT(boot_error)) + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, data abort exception\n", + socket_id, aid_id, fw_status); + + if (AMDGPU_RAS_GPU_ERR_UNKNOWN(boot_error)) + dev_info(adev->dev, + "socket: %d, aid: %d, fw_status: 0x%x, unknown boot time errors\n", + socket_id, aid_id, fw_status); } static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, -- cgit From 792be2e23ac69821db7860ba4ba94592101f0b07 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 1 Aug 2024 14:26:19 +0800 Subject: drm/amdgpu: create function to check RAS RMA status In the convenience of calling it globally. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9cda368ad794..16da939a8406 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2153,7 +2153,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * /* gpu reset is fallback for failed and default cases. * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. */ - if (poison_stat && !con->is_rma) { + if (poison_stat && !amdgpu_ras_is_rma(adev)) { event_id = amdgpu_ras_acquire_event_id(adev, type); RAS_EVENT_LOG(adev, event_id, "GPU reset for %s RAS poison consumption is issued!\n", @@ -2945,7 +2945,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_fini(&err_data); - if (err_cnt && con->is_rma) + if (err_cnt && amdgpu_ras_is_rma(adev)) amdgpu_ras_reset_gpu(adev); amdgpu_ras_schedule_retirement_dwork(con, @@ -3046,7 +3046,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, } /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ - if (reset_flags && !con->is_rma) { + if (reset_flags && !amdgpu_ras_is_rma(adev)) { if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) @@ -3192,7 +3192,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * This calling fails when is_rma is true or * ret != 0. */ - if (con->is_rma || ret) + if (amdgpu_ras_is_rma(adev) || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3241,7 +3241,7 @@ out: * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. */ - if (!con->is_rma) + if (!amdgpu_ras_is_rma(adev)) ret = 0; else ret = -EINVAL; @@ -4284,7 +4284,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); /* mode1 is the only selection for RMA status */ - if (ras->is_rma) { + if (amdgpu_ras_is_rma(adev)) { ras->gpu_reset_flags = 0; ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; } @@ -4824,3 +4824,13 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, va_end(args); } + +bool amdgpu_ras_is_rma(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return false; + + return con->is_rma; +} -- cgit From 671af06690e7f79db51b475a35c3b2619f345abc Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Fri, 2 Aug 2024 10:11:37 +0800 Subject: drm/amdgpu: remove RAS unused paramter 'err_addr' - amdgpu_ras_error_statistic_ue_count() - amdgpu_ras_error_statistic_ce_count() - amdgpu_ras_error_statistic_de_count() The parameter 'err_addr' is no longer used since following patch. Fixes: a7e8467fbeee ("drm/amdgpu: Remove unused code") Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 16da939a8406..61a2f386d9fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1223,11 +1223,11 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s for_each_ras_error(err_node, err_data) { err_info = &err_node->err_info; amdgpu_ras_error_statistic_de_count(&obj->err_data, - &err_info->mcm_info, NULL, err_info->de_count); + &err_info->mcm_info, err_info->de_count); amdgpu_ras_error_statistic_ce_count(&obj->err_data, - &err_info->mcm_info, NULL, err_info->ce_count); + &err_info->mcm_info, err_info->ce_count); amdgpu_ras_error_statistic_ue_count(&obj->err_data, - &err_info->mcm_info, NULL, err_info->ue_count); + &err_info->mcm_info, err_info->ue_count); } } else { /* for legacy asic path which doesn't has error source info */ @@ -4618,8 +4618,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d } int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, - struct amdgpu_smuio_mcm_config_info *mcm_info, - struct ras_err_addr *err_addr, u64 count) + struct amdgpu_smuio_mcm_config_info *mcm_info, + u64 count) { struct ras_err_info *err_info; @@ -4640,8 +4640,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, } int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, - struct amdgpu_smuio_mcm_config_info *mcm_info, - struct ras_err_addr *err_addr, u64 count) + struct amdgpu_smuio_mcm_config_info *mcm_info, + u64 count) { struct ras_err_info *err_info; @@ -4662,8 +4662,8 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, } int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, - struct amdgpu_smuio_mcm_config_info *mcm_info, - struct ras_err_addr *err_addr, u64 count) + struct amdgpu_smuio_mcm_config_info *mcm_info, + u64 count) { struct ras_err_info *err_info; -- cgit From c389a0604cfbcdb1f8f53a76560eb31e0700e206 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 9 Sep 2024 18:51:42 +0800 Subject: drm/amdgpu: disable GPU RAS bad page feature for specific ASIC The feature is not applicable to specific app platform. v2: update the disablement condition and commit description v3: move the setting to amdgpu_ras_check_supported Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 61a2f386d9fb..2e4c867e1d42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3468,6 +3468,11 @@ init_ras_enabled_flag: /* aca is disabled by default */ adev->aca.is_enabled = false; + + /* bad page feature is not applicable to specific app platform */ + if (adev->gmc.is_app_apu && + amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0)) + amdgpu_bad_page_threshold = 0; } static void amdgpu_ras_counte_dw(struct work_struct *work) -- cgit From 0110ac11952f06419d267f51a3989e989b17e67a Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Wed, 11 Sep 2024 12:27:38 +0800 Subject: drm/amdgpu: fix typo in the comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correctly spelled comments make it easier for the reader to understand the code. Replace 'udpate' with 'update' in the comment & replace 'recieved' with 'received' in the comment & replace 'dsiable' with 'disable' in the comment & replace 'Initiailize' with 'Initialize' in the comment & replace 'disble' with 'disable' in the comment & replace 'Disbale' with 'Disable' in the comment & replace 'enogh' with 'enough' in the comment & replace 'availabe' with 'available' in the comment. Acked-by: Christian König Signed-off-by: Yan Zhen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2e4c867e1d42..1a1395c5fff1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -882,7 +882,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (ret) return ret; - /* gfx block ras dsiable cmd must send to ras-ta */ + /* gfx block ras disable cmd must send to ras-ta */ if (head->block == AMDGPU_RAS_BLOCK__GFX) con->features |= BIT(head->block); -- cgit