From b712d7c20133b67f13aa134e7534369f19e1214f Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 14 May 2024 07:48:19 +0800 Subject: drm/amdgpu: fix compiler 'side-effect' check issue for RAS_EVENT_LOG() create a new helper function to avoid compiler 'side-effect' check about RAS_EVENT_LOG() macro. Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index c8980d5f6540..6a8c7b1609df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -67,13 +67,8 @@ struct amdgpu_iv_entry; /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) -#define RAS_EVENT_LOG(_adev, _id, _fmt, ...) \ -do { \ - if (amdgpu_ras_event_id_is_valid((_adev), (_id))) \ - dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__); \ - else \ - dev_info((_adev)->dev, _fmt, ##__VA_ARGS__); \ -} while (0) +#define RAS_EVENT_LOG(adev, id, fmt, ...) \ + amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__); enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, @@ -956,4 +951,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); +__printf(3, 4) +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, + const char *fmt, ...); + #endif -- cgit From cf85764e2bf1acbefb45e09919a8e9fafc58e119 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 21 May 2024 15:03:02 +0800 Subject: drm/amdgpu: correct hbm field in boot status hbm filed takes bit 13 and bit 14 in boot status. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6a8c7b1609df..d06c01b978cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -46,7 +46,7 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7) #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) -#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 13, 13) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 -- cgit From 473af28d3e63b9b679c7878df33616c7ca6ea947 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Tue, 28 May 2024 13:52:46 +0800 Subject: drm/amdgpu: Estimate RAS reservation when report capacity v2 Add estimate of how much vram we need to reserve for RAS when caculating the total available vram. v2: apply the change to MP0 v13_0_2 and v13_0_14 Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index d06c01b978cd..36a4de89c27d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -64,6 +64,9 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) @@ -541,6 +544,7 @@ struct amdgpu_ras { struct ras_event_manager __event_mgr; struct ras_event_manager *event_mgr; + uint64_t reserved_pages_in_bytes; }; struct ras_fs_data { -- cgit From a474161e84fc0b15534a80f8dfcbaf5e48fd8249 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 30 May 2024 16:02:12 +0800 Subject: drm/amdgpu: Update programming for boot error reporting AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid. The polling sequence is also simplifed according to the latest firmware change. Signed-off-by: Hawking Zhang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 36a4de89c27d..56b9bf63b67f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -47,12 +47,10 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) #define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100 #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF -#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) /* position of instance value in sub_block_index of -- cgit From b95fa494d6b74c30eeb4a50481aa1041c631754e Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 23 May 2024 11:23:20 +0800 Subject: drm/amdgpu: add RAS is_rma flag Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 56b9bf63b67f..e70c45712ddb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -522,6 +522,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; -- cgit