/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])
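
/* AMDGPU_RAS_FLAG_INIT_BY_VBIOS means the vbios has already enabled RAS
 * before the driver loads, so boot-time enabling only needs to set up the
 * ras objects (see amdgpu_ras_feature_enable_on_boot()).
 * AMDGPU_RAS_FLAG_INIT_NEED_RESET requests a GPU reset and repost, which is
 * honored in amdgpu_ras_resume().
 */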
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};
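
/* Set once (via atomic_cmpxchg() in amdgpu_ras_global_ras_isr()) when a
 * fatal ERREVENT_ATHUB_INTERRUPT is received; checked elsewhere through
 * amdgpu_ras_intr_triggered().
 */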
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
		uint64_t addr);

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);

	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9];
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			/* accept a decimal triple first, then a hex one */
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}

/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, say, GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, they are address, value.
 * As their names indicate, inject operation will write the
 * value to the address.
 *
 * The second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *
 * - 0: disable RAS on the block. Take ::head as its data.
 * - 1: enable RAS on the block. Take ::head as its data.
 * - 2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 *
 * In a program:
 *
 * Copy the struct ras_debug_if in your code and initialize it.
 * Write the struct to the control node.
 *
 * From bash:
 *
 * .. code-block:: bash
 *
 *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *
 * Parameters:
 *
 * op: disable, enable, inject
 *	disable: only block is needed
 *	enable: block and error are needed
 *	inject: error, address, value are needed
 * block: umc, sdma, gfx, .........
 *	see ras_block_string[] for details
 * error: ue, ce
 *	ue: multi_uncorrectable
 *	ce: single_correctable
 * sub_block_index: sub block index, pass 0 if there is no sub block
 *
 * Here are some examples for bash commands:
 *
 * .. code-block:: bash
 *
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * Operations are only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 * to see which blocks support RAS on a particular asic.
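 *
 * For the programmatic path, a minimal userspace sketch is shown below. It
 * is illustrative only: the dri index in the debugfs path varies per system,
 * and struct ras_debug_if must be copied from amdgpu_ras.h into the
 * program's sources.
 *
 * .. code-block:: c
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int ras_disable_umc(void)
 *	{
 *		struct ras_debug_if data = {0};
 *		int fd;
 *		ssize_t n;
 *
 *		data.op = 0;	// 0: disable RAS on the block
 *		data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *
 *		fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *		if (fd < 0)
 *			return -1;
 *		n = write(fd, &data, sizeof(data));
 *		close(fd);
 *		return n == (ssize_t)sizeof(data) ? 0 : -1;
 *	}
 */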

static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			ret = -EINVAL;
			break;
		}

		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
					data.inject.address);
			break;
		}

		/* data.inject.address is an offset instead of an absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

/**
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Some boards contain an EEPROM which is used to persistently store a list of
 * bad pages which experienced ECC errors in vram. This interface provides
 * a way to reset the EEPROM, e.g., after testing error injection.
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo 1 > ../ras/ras_eeprom_reset
 *
 * will reset the EEPROM table to 0 entries.
 */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);

	return ret == 1 ? size : -EIO;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};

/**
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * It allows the user to read the error count for each IP block on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * It outputs the multiple lines which report the uncorrected (ue) and corrected
 * (ce) errors that happened on a ras IP block.
 *
 * The format of one line is below,
 *
 * [ce|ue]: count
 *
 * Example:
 *
 * .. code-block:: bash
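 *
 *	ue: 0
 *	ce: 1
 *
 */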

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)
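
/* The objects in con->objs are reference counted by hand: every get_obj()
 * must be balanced by a put_obj(), and put_obj() unlinks the object from
 * con->head once its use count drops back to zero.
 */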
static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware does support ras, we can create the obj.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * IP blocks check con->support to see if they need to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	/* nothing to do if we are already in the requested state */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we created the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, &info, enable);
		if (ret) {
			DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
					enable ? "enable":"disable",
					ras_block_str(head->block),
					ret);
			if (ret == TA_RAS_STATUS__RESET_NEEDED)
				return -EAGAIN;
			return -EINVAL;
		}
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If current state == target state, it will do nothing,
			 * but sometimes it requests the driver to reset and
			 * repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: remove this workaround in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* set up the object, then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp,
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us,
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int i;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		if (adev->sdma.funcs->query_ras_error_count) {
			for (i = 0; i < adev->sdma.num_instances; i++)
				adev->sdma.funcs->query_ras_error_count(adev, i,
						&err_data);
		}
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.funcs->query_ras_error_count)
			adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		if (adev->nbio.funcs->query_ras_error_count)
			adev->nbio.funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count) {
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	}
	if (err_data.ue_count) {
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));
	}

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return 0;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */

/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
		return "R";
	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
		return "P";
	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
	default:
		return "F";
	}
}

/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the characters below:
 *
 * R: reserved, this gpu page is reserved and cannot be used.
 *
 * P: pending for reserve, this gpu page is marked as bad and will be
 * reserved in the next window of page_reserve.
 *
 * F: unable to reserve, this gpu page can't be reserved for some reason.
 *
 * Examples:
 *
 * .. code-block:: bash
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	/* round ppos up to the first whole element; end is the last element
	 * covered by [ppos, ppos + count)
	 */
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/**
 * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors
 *
 * Normally when there is an uncorrectable error, the driver will reset
 * the GPU to recover. However, in the event of an unrecoverable error,
 * the driver provides an interface to reboot the system automatically
 * in that event.
 *
 * The following file in debugfs provides that interface:
 * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot
 *
 * Usage:
 *
 * .. code-block:: bash
 *
 *	echo true > .../ras/auto_reboot
 */

/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_eeprom_ops);

	/*
	 * After one uncorrectable error happens, GPU recovery will usually
	 * be scheduled. But there is a known problem where GPU recovery can
	 * fail to bring the GPU back, so the interface below gives the user
	 * a direct way to have the system rebooted automatically when an
	 * ERREVENT_ATHUB_INTERRUPT is generated; the normal GPU recovery
	 * routine will then never be called.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
				&con->reboot);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove_recursive(con->dir);
	con->dir = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);

	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data; maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave that recovery to the IP; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error numbers
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}
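
/* The iv entry ring is a single-producer/single-consumer buffer:
 * amdgpu_ras_interrupt_dispatch() copies an entry in at wptr from the IH
 * path and schedules ih_work, while amdgpu_ras_interrupt_handler() drains
 * entries at rptr from the work handler above.
 */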
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data = &obj->ih_data;

	if (!obj)
		return -EINVAL;

	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
/* ih end */

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}

/* recovery begin */

/* return 0 on success.
 * the caller must free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		ret = -EINVAL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (data->bps_bo[i] == NULL)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, 0);

	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	/* grow the arrays in 512-entry chunks */
	unsigned int align_space = ALIGN(new_space, 512);
	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
	struct amdgpu_bo **bps_bo =
			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);

	if (!bps || !bps_bo) {
		kfree(bps);
		kfree(bps_bo);
		return -ENOMEM;
	}

	if (data->bps) {
		memcpy(bps, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}
	if (data->bps_bo) {
		memcpy(bps_bo, data->bps_bo,
				data->count * sizeof(*data->bps_bo));
		kfree(data->bps_bo);
	}

	data->bps = bps;
	data->bps_bo = bps_bo;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
	data->count += pages;
	data->space_left -= pages;

out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/*
 * write the error record array to eeprom; the function should be
 * protected by recovery_lock
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->num_recs;
	/* only new entries are saved */
	if (save_count > 0)
		if (amdgpu_ras_eeprom_process_recods(control,
					&data->bps[control->num_recs],
					true,
					save_count)) {
			DRM_ERROR("Failed to save EEPROM table data!");
			return -EIO;
		}

	return 0;
}

/*
 * read the error record array from eeprom and reserve enough space for
 * storing new bad pages
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
					&adev->psp.ras.ras->eeprom_control;
	struct eeprom_table_record *bps = NULL;
	int ret = 0;

	/* no bad page record, skip eeprom access */
	if (!control->num_recs)
		return ret;

	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
		control->num_recs)) {
		DRM_ERROR("Failed to load EEPROM table records!");
		ret = -EIO;
		goto out;
	}

	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);

out:
	kfree(bps);
	return ret;
}

/*
 * check if an address belongs to a bad page
 *
 * Note: this check is only for the umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
		uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i;
	bool ret = false;

	if (!con || !con->eh_data)
		return ret;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	addr >>= AMDGPU_GPU_PAGE_SHIFT;
	for (i = 0; i < data->count; i++)
		if (addr == data->bps[i].retired_page) {
			ret = true;
			goto out;
		}
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i, ret = 0;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].retired_page;

		/* There are two cases of reserve error that should be ignored:
		 * 1) a ras bad page has been allocated (used by someone);
		 * 2) a ras bad page has been reserved (duplicate error injection
		 *    for one page);
		 */
		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
					       AMDGPU_GPU_PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);

		data->bps_bo[i] = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}

	/* continue to save bad pages to eeprom even if reserve_vram fails */
	ret = amdgpu_ras_save_bad_pages(adev);
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps_bo[i];

		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps_bo[i] = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	int ret;

	if (!con)
		return 0;

	data = &con->eh_data;
	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	if (ret)
		goto free;

	if (con->eeprom_control.num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;
		ret = amdgpu_ras_reserve_bad_pages(adev);
		if (ret)
			goto release;
	}

	return 0;

release:
	amdgpu_ras_release_bad_pages(adev);
free:
	kfree((*data)->bps);
	kfree((*data)->bps_bo);
	kfree(*data);
	con->eh_data = NULL;
out:
	DRM_WARN("Failed to initialize ras recovery!\n");

	return ret;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to init it, so fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data->bps_bo);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset the gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check the hardware's ras ability, which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid ras operations from IP blocks.
 * if software itself (say, a boot parameter) limits the ras ability, we still
 * need to allow IP blocks to do some limited operations, like disable. In
 * such a case, we have to initialize ras as normal, but check in each
 * function whether the operation is allowed or not.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
	    (adev->asic_type != CHIP_VEGA20 &&
	     adev->asic_type != CHIP_ARCTURUS))
		return;

	if (adev->is_atom_fw &&
	    (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
	     amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;
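
	/* The module parameters then gate the software mask: amdgpu_ras_enable=0
	 * turns RAS off entirely, otherwise amdgpu_ras_mask selects a subset of
	 * the hardware-supported blocks.
	 */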
	*supported = amdgpu_ras_enable == 0 ?
			0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (adev->nbio.funcs->init_ras_controller_interrupt) {
		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
		if (r)
			return r;
	}

	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
		if (r)
			return r;
	}

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	int r;

	/* disable RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (r == -EAGAIN) {
			/* request a gpu reset. will run again */
			amdgpu_ras_request_reset_on_boot(adev,
					ras_block->block);
			return 0;
		} else if (adev->in_suspend || adev->in_gpu_reset) {
			/* in the resume phase, if failing to enable ras,
			 * clean up all ras fs nodes, and disable ras
			 */
			goto cleanup;
		} else
			return r;
	}

	/* in the resume phase, no need to create ras fs nodes */
	if (adev->in_suspend || adev->in_gpu_reset)
		return 0;

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	amdgpu_ras_debugfs_create(adev, fs_info);

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

/* helper function to remove ras fs node and interrupt handler */
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	amdgpu_ras_debugfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
	amdgpu_ras_feature_enable(adev, ras_block, 0);
}

/* do some init work after IP late init as dependence.
 * it runs in the resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but since the driver does not handle
		 * it, ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them, and one or more IPs
		 * may not be implemented yet, so we disable those on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * This is only for the init_by_vbios case.
		 * If we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	uint32_t hw_supported, supported;

	amdgpu_ras_check_supported(adev, &hw_supported, &supported);
	if (!hw_supported)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}