This source file includes the following definitions.
- amdgpu_ras_debugfs_read
- amdgpu_ras_find_block_id_by_name
- amdgpu_ras_debugfs_ctrl_parse_data
- amdgpu_ras_debugfs_ctrl_write
- amdgpu_ras_sysfs_read
- put_obj
- amdgpu_ras_create_obj
- amdgpu_ras_find_obj
- amdgpu_ras_is_feature_allowed
- amdgpu_ras_is_feature_enabled
- __amdgpu_ras_feature_enable
- amdgpu_ras_feature_enable
- amdgpu_ras_feature_enable_on_boot
- amdgpu_ras_disable_all_features
- amdgpu_ras_enable_all_features
- amdgpu_ras_error_query
- amdgpu_ras_error_inject
- amdgpu_ras_error_cure
- amdgpu_ras_query_error_count
- amdgpu_ras_badpage_flags_str
- amdgpu_ras_sysfs_badpages_read
- amdgpu_ras_sysfs_features_read
- amdgpu_ras_sysfs_create_feature_node
- amdgpu_ras_sysfs_remove_feature_node
- amdgpu_ras_sysfs_create
- amdgpu_ras_sysfs_remove
- amdgpu_ras_sysfs_remove_all
- amdgpu_ras_debugfs_create_ctrl_node
- amdgpu_ras_debugfs_create
- amdgpu_ras_debugfs_remove
- amdgpu_ras_debugfs_remove_all
- amdgpu_ras_fs_init
- amdgpu_ras_fs_fini
- amdgpu_ras_interrupt_handler
- amdgpu_ras_interrupt_process_handler
- amdgpu_ras_interrupt_dispatch
- amdgpu_ras_interrupt_remove_handler
- amdgpu_ras_interrupt_add_handler
- amdgpu_ras_interrupt_remove_all
- amdgpu_ras_badpages_read
- amdgpu_ras_do_recovery
- amdgpu_ras_realloc_eh_data_space
- amdgpu_ras_add_bad_pages
- amdgpu_ras_reserve_bad_pages
- amdgpu_ras_release_bad_pages
- amdgpu_ras_save_bad_pages
- amdgpu_ras_load_bad_pages
- amdgpu_ras_recovery_init
- amdgpu_ras_recovery_fini
- amdgpu_ras_request_reset_on_boot
- amdgpu_ras_check_supported
- amdgpu_ras_init
- amdgpu_ras_resume
- amdgpu_ras_suspend
- amdgpu_ras_pre_fini
- amdgpu_ras_fini
24 #include <linux/debugfs.h>
25 #include <linux/list.h>
26 #include <linux/module.h>
27 #include <linux/uaccess.h>
28
29 #include "amdgpu.h"
30 #include "amdgpu_ras.h"
31 #include "amdgpu_atomfirmware.h"
32
33 const char *ras_error_string[] = {
34 "none",
35 "parity",
36 "single_correctable",
37 "multi_uncorrectable",
38 "poison",
39 };
40
41 const char *ras_block_string[] = {
42 "umc",
43 "sdma",
44 "gfx",
45 "mmhub",
46 "athub",
47 "pcie_bif",
48 "hdp",
49 "xgmi_wafl",
50 "df",
51 "smn",
52 "sem",
53 "mp0",
54 "mp1",
55 "fuse",
56 };
57
58 #define ras_err_str(i) (ras_error_string[ffs(i)])
59 #define ras_block_str(i) (ras_block_string[i])
60
61 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
62 #define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
63 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
64
65
66 #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
67
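/*
 * debugfs read handler for a RAS object's per-block node: query the
 * block's current UE/CE error counts and return them as text.
 */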
68 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
69 size_t size, loff_t *pos)
70 {
71 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
72 struct ras_query_if info = {
73 .head = obj->head,
74 };
75 ssize_t s;
76 char val[128];
77
78 if (amdgpu_ras_error_query(obj->adev, &info))
79 return -EINVAL;
80
81 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
82 "ue", info.ue_count,
83 "ce", info.ce_count);
84 if (*pos >= s)
85 return 0;
86
87 s -= *pos;
88 s = min_t(u64, s, size);
89
90
91 if (copy_to_user(buf, &val[*pos], s))
92 return -EINVAL;
93
94 *pos += s;
95
96 return s;
97 }
98
99 static const struct file_operations amdgpu_ras_debugfs_ops = {
100 .owner = THIS_MODULE,
101 .read = amdgpu_ras_debugfs_read,
102 .write = NULL,
103 .llseek = default_llseek
104 };
105
106 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
107 {
108 int i;
109
110 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
111 *block_id = i;
112 if (strcmp(name, ras_block_str(i)) == 0)
113 return 0;
114 }
115 return -EINVAL;
116 }
117
118 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
119 const char __user *buf, size_t size,
120 loff_t *pos, struct ras_debug_if *data)
121 {
122 ssize_t s = min_t(u64, 64, size);
123 char str[65];
124 char block_name[33];
125 char err[9] = "ue";
126 int op = -1;
127 int block_id;
128 uint32_t sub_block;
129 u64 address, value;
130
131 if (*pos)
132 return -EINVAL;
133 *pos = size;
134
135 memset(str, 0, sizeof(str));
136 memset(data, 0, sizeof(*data));
137
138 if (copy_from_user(str, buf, s))
139 return -EINVAL;
140
141 if (sscanf(str, "disable %32s", block_name) == 1)
142 op = 0;
143 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
144 op = 1;
145 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
146 op = 2;
147 else if (str[0] && str[1] && str[2] && str[3])
148
149 return -EINVAL;
150
151 if (op != -1) {
152 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
153 return -EINVAL;
154
155 data->head.block = block_id;
156
157 if (!memcmp("ue", err, 2))
158 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
159 else if (!memcmp("ce", err, 2))
160 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
161 else
162 return -EINVAL;
163
164 data->op = op;
165
166 if (op == 2) {
167 if (sscanf(str, "%*s %*s %*s %u %llu %llu",
168 &sub_block, &address, &value) != 3)
169 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
170 &sub_block, &address, &value) != 3)
171 return -EINVAL;
172 data->head.sub_block_index = sub_block;
173 data->inject.address = address;
174 data->inject.value = value;
175 }
176 } else {
177 if (size < sizeof(*data))
178 return -EINVAL;
179
180 if (copy_from_user(data, buf, sizeof(*data)))
181 return -EINVAL;
182 }
183
184 return 0;
185 }
186
/*
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The "ras_ctrl" debugfs node accepts either a raw struct ras_debug_if
 * (when the input does not look like an ASCII command) or a text command
 * of the form
 *
 *	<op> <block> [<error> [<sub_block> <address> <value>]]
 *
 * op:        "disable", "enable" or "inject"
 *            - disable: only <block> is needed
 *            - enable:  <block> and <error> are needed
 *            - inject:  <block>, <error>, <sub_block>, <address> and
 *                       <value> are all needed
 * block:     one of the names in ras_block_string[] (umc, sdma, gfx, ...)
 * error:     "ue" (multi_uncorrectable) or "ce" (single_correctable)
 * sub_block: sub block index, pass 0 if the block has no sub block
 * address/value: accepted in decimal or 0x-prefixed hexadecimal
 *
 * The node is created in the "ras" directory of the DRM minor's debugfs
 * root, typically /sys/kernel/debug/dri/<n>/ras/ras_ctrl, e.g.
 *
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "enable gfx ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0 0 0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * Commands on unsupported blocks are rejected, and injection addresses
 * must be below both the VRAM size and RAS_UMC_INJECT_ADDR_LIMIT.
 */
245 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
246 size_t size, loff_t *pos)
247 {
248 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
249 struct ras_debug_if data;
250 int ret = 0;
251
252 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
253 if (ret)
254 return -EINVAL;
255
256 if (!amdgpu_ras_is_supported(adev, data.head.block))
257 return -EINVAL;
258
259 switch (data.op) {
260 case 0:
261 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
262 break;
263 case 1:
264 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
265 break;
266 case 2:
267 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
268 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
269 ret = -EINVAL;
270 break;
271 }
272
273
274 ret = amdgpu_ras_error_inject(adev, &data.inject);
275 break;
276 default:
277 ret = -EINVAL;
278 break;
279 }
280
281 if (ret)
282 return -EINVAL;
283
284 return size;
285 }
286
287 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
288 .owner = THIS_MODULE,
289 .read = NULL,
290 .write = amdgpu_ras_debugfs_ctrl_write,
291 .llseek = default_llseek
292 };
293
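/*
 * sysfs "show" handler for a RAS object's per-block error-count node;
 * reports the same UE/CE counters as the debugfs node.
 */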
294 static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
295 struct device_attribute *attr, char *buf)
296 {
297 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
298 struct ras_query_if info = {
299 .head = obj->head,
300 };
301
302 if (amdgpu_ras_error_query(obj->adev, &info))
303 return -EINVAL;
304
305 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
306 "ue", info.ue_count,
307 "ce", info.ce_count);
308 }
309
310
311
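/*
 * RAS objects are reference counted: get_obj/put_obj track users, and an
 * object is dropped from the context list once its use count reaches zero.
 */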
312 #define get_obj(obj) do { (obj)->use++; } while (0)
313 #define alive_obj(obj) ((obj)->use)
314
315 static inline void put_obj(struct ras_manager *obj)
316 {
317 if (obj && --obj->use == 0)
318 list_del(&obj->node);
319 if (obj && obj->use < 0) {
320 DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
321 }
322 }
323
324
325 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
326 struct ras_common_if *head)
327 {
328 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
329 struct ras_manager *obj;
330
331 if (!con)
332 return NULL;
333
334 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
335 return NULL;
336
337 obj = &con->objs[head->block];
338
339 if (alive_obj(obj))
340 return NULL;
341
342 obj->head = *head;
343 obj->adev = adev;
344 list_add(&obj->node, &con->head);
345 get_obj(obj);
346
347 return obj;
348 }
349
350
351 static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
352 struct ras_common_if *head)
353 {
354 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
355 struct ras_manager *obj;
356 int i;
357
358 if (!con)
359 return NULL;
360
361 if (head) {
362 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
363 return NULL;
364
365 obj = &con->objs[head->block];
366
367 if (alive_obj(obj)) {
368 WARN_ON(head->block != obj->head.block);
369 return obj;
370 }
371 } else {
372 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
373 obj = &con->objs[i];
374 if (alive_obj(obj)) {
375 WARN_ON(i != obj->head.block);
376 return obj;
377 }
378 }
379 }
380
381 return NULL;
382 }
383
384
385
386 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
387 struct ras_common_if *head)
388 {
389 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
390
391 return con->hw_supported & BIT(head->block);
392 }
393
394 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
395 struct ras_common_if *head)
396 {
397 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
398
399 return con->features & BIT(head->block);
400 }
401
402
403
404
405
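/*
 * Update the driver-side feature state only: create or release the RAS
 * object for the block and set/clear its bit in con->features, without
 * issuing any command to the RAS TA.
 */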
406 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
407 struct ras_common_if *head, int enable)
408 {
409 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
410 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
411
412
413
414
415
416
417
418 if (!amdgpu_ras_is_feature_allowed(adev, head))
419 return 0;
420 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
421 return 0;
422
423 if (enable) {
424 if (!obj) {
425 obj = amdgpu_ras_create_obj(adev, head);
426 if (!obj)
427 return -EINVAL;
428 } else {
429
430 get_obj(obj);
431 }
432 con->features |= BIT(head->block);
433 } else {
434 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
435 con->features &= ~BIT(head->block);
436 put_obj(obj);
437 }
438 }
439
440 return 0;
441 }
442
443
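/*
 * Wrapper of psp_ras_enable_features: ask the RAS TA to enable or disable
 * the feature, then mirror the result in the driver-side state.
 */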
444 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
445 struct ras_common_if *head, bool enable)
446 {
447 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
448 union ta_ras_cmd_input info;
449 int ret;
450
451 if (!con)
452 return -EINVAL;
453
454 if (!enable) {
455 info.disable_features = (struct ta_ras_disable_features_input) {
456 .block_id = amdgpu_ras_block_to_ta(head->block),
457 .error_type = amdgpu_ras_error_to_ta(head->type),
458 };
459 } else {
460 info.enable_features = (struct ta_ras_enable_features_input) {
461 .block_id = amdgpu_ras_block_to_ta(head->block),
462 .error_type = amdgpu_ras_error_to_ta(head->type),
463 };
464 }
465
466
467 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
468
469 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
470 return 0;
471
472 ret = psp_ras_enable_features(&adev->psp, &info, enable);
473 if (ret) {
474 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
475 enable ? "enable":"disable",
476 ras_block_str(head->block),
477 ret);
478 if (ret == TA_RAS_STATUS__RESET_NEEDED)
479 return -EAGAIN;
480 return -EINVAL;
481 }
482
483
484 __amdgpu_ras_feature_enable(adev, head, enable);
485
486 return 0;
487 }
488
489
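/*
 * Boot-time variant used when RAS was enabled by the vBIOS
 * (AMDGPU_RAS_FLAG_INIT_BY_VBIOS): on enable, if the TA call fails with
 * -EINVAL the software object is still created; on disable, the object is
 * set up first and the TA disable command is issued afterwards.
 */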
490 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
491 struct ras_common_if *head, bool enable)
492 {
493 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
494 int ret;
495
496 if (!con)
497 return -EINVAL;
498
499 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
500 if (enable) {
501
502
503
504
505
506
507 ret = amdgpu_ras_feature_enable(adev, head, 1);
508
509
510
511
512 if (ret == -EINVAL) {
513 ret = __amdgpu_ras_feature_enable(adev, head, 1);
514 if (!ret)
515 DRM_INFO("RAS INFO: %s setup object\n",
516 ras_block_str(head->block));
517 }
518 } else {
519
520 ret = __amdgpu_ras_feature_enable(adev, head, 1);
521 if (ret)
522 return ret;
523
524 ret = amdgpu_ras_feature_enable(adev, head, 0);
525 }
526 } else
527 ret = amdgpu_ras_feature_enable(adev, head, enable);
528
529 return ret;
530 }
531
532 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
533 bool bypass)
534 {
535 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
536 struct ras_manager *obj, *tmp;
537
538 list_for_each_entry_safe(obj, tmp, &con->head, node) {
539
540
541
542 if (bypass) {
543 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
544 break;
545 } else {
546 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
547 break;
548 }
549 }
550
551 return con->features;
552 }
553
554 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
555 bool bypass)
556 {
557 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
558 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
559 int i;
560 const enum amdgpu_ras_error_type default_ras_type =
561 AMDGPU_RAS_ERROR__NONE;
562
563 for (i = 0; i < ras_block_count; i++) {
564 struct ras_common_if head = {
565 .block = i,
566 .type = default_ras_type,
567 .sub_block_index = 0,
568 };
569 strcpy(head.name, ras_block_str(i));
570 if (bypass) {
571
572
573
574
575 if (__amdgpu_ras_feature_enable(adev, &head, 1))
576 break;
577 } else {
578 if (amdgpu_ras_feature_enable(adev, &head, 1))
579 break;
580 }
581 }
582
583 return con->features;
584 }
585
586
587
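/*
 * Query the error counters of a single block, accumulate them into the
 * object's running totals and report the totals back to the caller.
 */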
588 int amdgpu_ras_error_query(struct amdgpu_device *adev,
589 struct ras_query_if *info)
590 {
591 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
592 struct ras_err_data err_data = {0, 0, 0, NULL};
593
594 if (!obj)
595 return -EINVAL;
596
597 switch (info->head.block) {
598 case AMDGPU_RAS_BLOCK__UMC:
599 if (adev->umc.funcs->query_ras_error_count)
600 adev->umc.funcs->query_ras_error_count(adev, &err_data);
601
602
603
604 if (adev->umc.funcs->query_ras_error_address)
605 adev->umc.funcs->query_ras_error_address(adev, &err_data);
606 break;
607 case AMDGPU_RAS_BLOCK__GFX:
608 if (adev->gfx.funcs->query_ras_error_count)
609 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
610 break;
611 case AMDGPU_RAS_BLOCK__MMHUB:
612 if (adev->mmhub_funcs->query_ras_error_count)
613 adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
614 break;
615 default:
616 break;
617 }
618
619 obj->err_data.ue_count += err_data.ue_count;
620 obj->err_data.ce_count += err_data.ce_count;
621
622 info->ue_count = obj->err_data.ue_count;
623 info->ce_count = obj->err_data.ce_count;
624
625 if (err_data.ce_count)
626 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
627 obj->err_data.ce_count, ras_block_str(info->head.block));
628 if (err_data.ue_count)
629 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
630 obj->err_data.ue_count, ras_block_str(info->head.block));
631
632 return 0;
633 }
634
635
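/*
 * Inject an error into the given block: GFX uses the IP's own inject hook,
 * UMC and MMHUB go through the RAS TA via psp_ras_trigger_error().
 */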
636 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
637 struct ras_inject_if *info)
638 {
639 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
640 struct ta_ras_trigger_error_input block_info = {
641 .block_id = amdgpu_ras_block_to_ta(info->head.block),
642 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
643 .sub_block_index = info->head.sub_block_index,
644 .address = info->address,
645 .value = info->value,
646 };
647 int ret = 0;
648
649 if (!obj)
650 return -EINVAL;
651
652 switch (info->head.block) {
653 case AMDGPU_RAS_BLOCK__GFX:
654 if (adev->gfx.funcs->ras_error_inject)
655 ret = adev->gfx.funcs->ras_error_inject(adev, info);
656 else
657 ret = -EINVAL;
658 break;
659 case AMDGPU_RAS_BLOCK__UMC:
660 case AMDGPU_RAS_BLOCK__MMHUB:
661 ret = psp_ras_trigger_error(&adev->psp, &block_info);
662 break;
663 default:
664 DRM_INFO("%s error injection is not supported yet\n",
665 ras_block_str(info->head.block));
666 ret = -EINVAL;
667 }
668
669 if (ret)
670 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
671 ras_block_str(info->head.block),
672 ret);
673
674 return ret;
675 }
676
677 int amdgpu_ras_error_cure(struct amdgpu_device *adev,
678 struct ras_cure_if *info)
679 {
680
681 return 0;
682 }
683
684
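/*
 * Return the total correctable (is_ce) or uncorrectable error count summed
 * over all RAS blocks that currently have an object.
 */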
685 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
686 bool is_ce)
687 {
688 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
689 struct ras_manager *obj;
690 struct ras_err_data data = {0, 0};
691
692 if (!con)
693 return 0;
694
695 list_for_each_entry(obj, &con->head, node) {
696 struct ras_query_if info = {
697 .head = obj->head,
698 };
699
700 if (amdgpu_ras_error_query(adev, &info))
701 return 0;
702
703 data.ce_count += info.ce_count;
704 data.ue_count += info.ue_count;
705 }
706
707 return is_ce ? data.ce_count : data.ue_count;
708 }
709
710
711
712
713
714 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
715 struct ras_badpage **bps, unsigned int *count);
716
717 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
718 {
719 switch (flags) {
720 case 0:
721 return "R";
722 case 1:
723 return "P";
724 case 2:
725 default:
726 return "F";
727 }
728 }
729
/*
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages interface
 *
 * The "gpu_vram_bad_pages" node in the device's "ras" sysfs group dumps
 * the list of retired VRAM pages, one entry per line:
 *
 *	0x<gpu pfn> : 0x<page size> : <flag>
 *
 * where <flag> is one of
 *	"R": the page has been reserved and is no longer usable,
 *	"P": the page is pending reservation,
 *	"F": reserving the page failed.
 */
753 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
754 struct kobject *kobj, struct bin_attribute *attr,
755 char *buf, loff_t ppos, size_t count)
756 {
757 struct amdgpu_ras *con =
758 container_of(attr, struct amdgpu_ras, badpages_attr);
759 struct amdgpu_device *adev = con->adev;
760 const unsigned int element_size =
761 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
762 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
763 unsigned int end = div64_ul(ppos + count - 1, element_size);
764 ssize_t s = 0;
765 struct ras_badpage *bps = NULL;
766 unsigned int bps_count = 0;
767
768 memset(buf, 0, count);
769
770 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
771 return 0;
772
773 for (; start < end && start < bps_count; start++)
774 s += scnprintf(&buf[s], element_size + 1,
775 "0x%08x : 0x%08x : %1s\n",
776 bps[start].bp,
777 bps[start].size,
778 amdgpu_ras_badpage_flags_str(bps[start].flags));
779
780 kfree(bps);
781
782 return s;
783 }
784
785 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
786 struct device_attribute *attr, char *buf)
787 {
788 struct amdgpu_ras *con =
789 container_of(attr, struct amdgpu_ras, features_attr);
790
791 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
792 }
793
794 static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
795 {
796 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
797 struct attribute *attrs[] = {
798 &con->features_attr.attr,
799 NULL
800 };
801 struct bin_attribute *bin_attrs[] = {
802 &con->badpages_attr,
803 NULL
804 };
805 struct attribute_group group = {
806 .name = "ras",
807 .attrs = attrs,
808 .bin_attrs = bin_attrs,
809 };
810
811 con->features_attr = (struct device_attribute) {
812 .attr = {
813 .name = "features",
814 .mode = S_IRUGO,
815 },
816 .show = amdgpu_ras_sysfs_features_read,
817 };
818
819 con->badpages_attr = (struct bin_attribute) {
820 .attr = {
821 .name = "gpu_vram_bad_pages",
822 .mode = S_IRUGO,
823 },
824 .size = 0,
825 .private = NULL,
826 .read = amdgpu_ras_sysfs_badpages_read,
827 };
828
829 sysfs_attr_init(attrs[0]);
830 sysfs_bin_attr_init(bin_attrs[0]);
831
832 return sysfs_create_group(&adev->dev->kobj, &group);
833 }
834
835 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
836 {
837 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
838 struct attribute *attrs[] = {
839 &con->features_attr.attr,
840 NULL
841 };
842 struct bin_attribute *bin_attrs[] = {
843 &con->badpages_attr,
844 NULL
845 };
846 struct attribute_group group = {
847 .name = "ras",
848 .attrs = attrs,
849 .bin_attrs = bin_attrs,
850 };
851
852 sysfs_remove_group(&adev->dev->kobj, &group);
853
854 return 0;
855 }
856
857 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
858 struct ras_fs_if *head)
859 {
860 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
861
862 if (!obj || obj->attr_inuse)
863 return -EINVAL;
864
865 get_obj(obj);
866
867 memcpy(obj->fs_data.sysfs_name,
868 head->sysfs_name,
869 sizeof(obj->fs_data.sysfs_name));
870
871 obj->sysfs_attr = (struct device_attribute){
872 .attr = {
873 .name = obj->fs_data.sysfs_name,
874 .mode = S_IRUGO,
875 },
876 .show = amdgpu_ras_sysfs_read,
877 };
878 sysfs_attr_init(&obj->sysfs_attr.attr);
879
880 if (sysfs_add_file_to_group(&adev->dev->kobj,
881 &obj->sysfs_attr.attr,
882 "ras")) {
883 put_obj(obj);
884 return -EINVAL;
885 }
886
887 obj->attr_inuse = 1;
888
889 return 0;
890 }
891
892 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
893 struct ras_common_if *head)
894 {
895 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
896
897 if (!obj || !obj->attr_inuse)
898 return -EINVAL;
899
900 sysfs_remove_file_from_group(&adev->dev->kobj,
901 &obj->sysfs_attr.attr,
902 "ras");
903 obj->attr_inuse = 0;
904 put_obj(obj);
905
906 return 0;
907 }
908
909 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
910 {
911 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
912 struct ras_manager *obj, *tmp;
913
914 list_for_each_entry_safe(obj, tmp, &con->head, node) {
915 amdgpu_ras_sysfs_remove(adev, &obj->head);
916 }
917
918 amdgpu_ras_sysfs_remove_feature_node(adev);
919
920 return 0;
921 }
922
923
924
925 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
926 {
927 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
928 struct drm_minor *minor = adev->ddev->primary;
929
930 con->dir = debugfs_create_dir("ras", minor->debugfs_root);
931 con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
932 adev, &amdgpu_ras_debugfs_ctrl_ops);
933 }
934
935 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
936 struct ras_fs_if *head)
937 {
938 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
939 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
940
941 if (!obj || obj->ent)
942 return;
943
944 get_obj(obj);
945
946 memcpy(obj->fs_data.debugfs_name,
947 head->debugfs_name,
948 sizeof(obj->fs_data.debugfs_name));
949
950 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
951 S_IWUGO | S_IRUGO, con->dir, obj,
952 &amdgpu_ras_debugfs_ops);
953 }
954
955 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
956 struct ras_common_if *head)
957 {
958 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
959
960 if (!obj || !obj->ent)
961 return;
962
963 debugfs_remove(obj->ent);
964 obj->ent = NULL;
965 put_obj(obj);
966 }
967
968 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
969 {
970 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
971 struct ras_manager *obj, *tmp;
972
973 list_for_each_entry_safe(obj, tmp, &con->head, node) {
974 amdgpu_ras_debugfs_remove(adev, &obj->head);
975 }
976
977 debugfs_remove(con->ent);
978 debugfs_remove(con->dir);
979 con->dir = NULL;
980 con->ent = NULL;
981 }
982
983
984
985
986 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
987 {
988 amdgpu_ras_sysfs_create_feature_node(adev);
989 amdgpu_ras_debugfs_create_ctrl_node(adev);
990
991 return 0;
992 }
993
994 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
995 {
996 amdgpu_ras_debugfs_remove_all(adev);
997 amdgpu_ras_sysfs_remove_all(adev);
998 return 0;
999 }
1000
1001
1002
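/*
 * Drain the object's interrupt ring: each entry is passed to the IP's
 * callback and, on success, the reported UE/CE counts are added to the
 * object's totals.
 */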
1003 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1004 {
1005 struct ras_ih_data *data = &obj->ih_data;
1006 struct amdgpu_iv_entry entry;
1007 int ret;
1008 struct ras_err_data err_data = {0, 0, 0, NULL};
1009
1010 while (data->rptr != data->wptr) {
1011 rmb();
1012 memcpy(&entry, &data->ring[data->rptr],
1013 data->element_size);
1014
1015 wmb();
1016 data->rptr = (data->aligned_element_size +
1017 data->rptr) % data->ring_size;
1018
1019
1020
1021
1022 if (data->cb) {
1023 ret = data->cb(obj->adev, &err_data, &entry);
1024
1025
1026
1027
1028
1029 if (ret == AMDGPU_RAS_SUCCESS) {
1030
1031
1032
1033 obj->err_data.ue_count += err_data.ue_count;
1034 obj->err_data.ce_count += err_data.ce_count;
1035 }
1036 }
1037 }
1038 }
1039
1040 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1041 {
1042 struct ras_ih_data *data =
1043 container_of(work, struct ras_ih_data, ih_work);
1044 struct ras_manager *obj =
1045 container_of(data, struct ras_manager, ih_data);
1046
1047 amdgpu_ras_interrupt_handler(obj);
1048 }
1049
1050 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1051 struct ras_dispatch_if *info)
1052 {
1053 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1054 struct ras_ih_data *data;
1055 
1056 if (!obj)
1057 return -EINVAL;
1058 data = &obj->ih_data;
1059 if (data->inuse == 0)
1060 return 0;
1061
1062
1063 memcpy(&data->ring[data->wptr], info->entry,
1064 data->element_size);
1065
1066 wmb();
1067 data->wptr = (data->aligned_element_size +
1068 data->wptr) % data->ring_size;
1069
1070 schedule_work(&data->ih_work);
1071
1072 return 0;
1073 }
1074
1075 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1076 struct ras_ih_if *info)
1077 {
1078 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1079 struct ras_ih_data *data;
1080
1081 if (!obj)
1082 return -EINVAL;
1083
1084 data = &obj->ih_data;
1085 if (data->inuse == 0)
1086 return 0;
1087
1088 cancel_work_sync(&data->ih_work);
1089
1090 kfree(data->ring);
1091 memset(data, 0, sizeof(*data));
1092 put_obj(obj);
1093
1094 return 0;
1095 }
1096
1097 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1098 struct ras_ih_if *info)
1099 {
1100 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1101 struct ras_ih_data *data;
1102
1103 if (!obj) {
1104
1105 obj = amdgpu_ras_create_obj(adev, &info->head);
1106 if (!obj)
1107 return -EINVAL;
1108 } else
1109 get_obj(obj);
1110
1111 data = &obj->ih_data;
1112
1113 *data = (struct ras_ih_data) {
1114 .inuse = 0,
1115 .cb = info->cb,
1116 .element_size = sizeof(struct amdgpu_iv_entry),
1117 .rptr = 0,
1118 .wptr = 0,
1119 };
1120
1121 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1122
1123 data->aligned_element_size = ALIGN(data->element_size, 8);
1124
1125 data->ring_size = 64 * data->aligned_element_size;
1126 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1127 if (!data->ring) {
1128 put_obj(obj);
1129 return -ENOMEM;
1130 }
1131
1132
1133 data->inuse = 1;
1134
1135 return 0;
1136 }
1137
1138 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1139 {
1140 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1141 struct ras_manager *obj, *tmp;
1142
1143 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1144 struct ras_ih_if info = {
1145 .head = obj->head,
1146 };
1147 amdgpu_ras_interrupt_remove_handler(adev, &info);
1148 }
1149
1150 return 0;
1151 }
1152
1153
1154
1155
1156
1157
1158
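/*
 * Snapshot the bad-page list under recovery_lock; used by the
 * gpu_vram_bad_pages sysfs node. Flags: 0 = reserved, 1 = not reserved
 * yet, 2 = reserving the page failed (bo == NULL).
 */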
1159 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1160 struct ras_badpage **bps, unsigned int *count)
1161 {
1162 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1163 struct ras_err_handler_data *data;
1164 int i = 0;
1165 int ret = 0;
1166
1167 if (!con || !con->eh_data || !bps || !count)
1168 return -EINVAL;
1169
1170 mutex_lock(&con->recovery_lock);
1171 data = con->eh_data;
1172 if (!data || data->count == 0) {
1173 *bps = NULL;
1174 goto out;
1175 }
1176
1177 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1178 if (!*bps) {
1179 ret = -ENOMEM;
1180 goto out;
1181 }
1182
1183 for (; i < data->count; i++) {
1184 (*bps)[i] = (struct ras_badpage){
1185 .bp = data->bps[i].bp,
1186 .size = AMDGPU_GPU_PAGE_SIZE,
1187 .flags = 0,
1188 };
1189
1190 if (data->last_reserved <= i)
1191 (*bps)[i].flags = 1;
1192 else if (data->bps[i].bo == NULL)
1193 (*bps)[i].flags = 2;
1194 }
1195
1196 *count = data->count;
1197 out:
1198 mutex_unlock(&con->recovery_lock);
1199 return ret;
1200 }
1201
1202 static void amdgpu_ras_do_recovery(struct work_struct *work)
1203 {
1204 struct amdgpu_ras *ras =
1205 container_of(work, struct amdgpu_ras, recovery_work);
1206
1207 amdgpu_device_gpu_recover(ras->adev, 0);
1208 atomic_set(&ras->in_recovery, 0);
1209 }
1210
1211
1212 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1213 struct ras_err_handler_data *data, int pages)
1214 {
1215 unsigned int old_space = data->count + data->space_left;
1216 unsigned int new_space = old_space + pages;
1217 unsigned int align_space = ALIGN(new_space, 1024);
1218 void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1219
1220 if (!tmp)
1221 return -ENOMEM;
1222
1223 if (data->bps) {
1224 memcpy(tmp, data->bps,
1225 data->count * sizeof(*data->bps));
1226 kfree(data->bps);
1227 }
1228
1229 data->bps = tmp;
1230 data->space_left += align_space - old_space;
1231 return 0;
1232 }
1233
1234
1235 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1236 unsigned long *bps, int pages)
1237 {
1238 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1239 struct ras_err_handler_data *data;
1240 int i = pages;
1241 int ret = 0;
1242
1243 if (!con || !con->eh_data || !bps || pages <= 0)
1244 return 0;
1245
1246 mutex_lock(&con->recovery_lock);
1247 data = con->eh_data;
1248 if (!data)
1249 goto out;
1250
1251 if (data->space_left <= pages)
1252 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1253 ret = -ENOMEM;
1254 goto out;
1255 }
1256
1257 while (i--)
1258 data->bps[data->count++].bp = bps[i];
1259
1260 data->space_left -= pages;
1261 out:
1262 mutex_unlock(&con->recovery_lock);
1263
1264 return ret;
1265 }
1266
1267
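/*
 * Reserve all not-yet-reserved bad pages as kernel BOs in VRAM so the
 * allocator can no longer hand them out; amdgpu_ras_recovery_init() calls
 * this after loading the saved bad-page list.
 */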
1268 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1269 {
1270 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1271 struct ras_err_handler_data *data;
1272 uint64_t bp;
1273 struct amdgpu_bo *bo = NULL;
1274 int i;
1275
1276 if (!con || !con->eh_data)
1277 return 0;
1278
1279 mutex_lock(&con->recovery_lock);
1280 data = con->eh_data;
1281 if (!data)
1282 goto out;
1283
1284 for (i = data->last_reserved; i < data->count; i++) {
1285 bp = data->bps[i].bp;
1286
1287 if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE,
1288 AMDGPU_GEM_DOMAIN_VRAM,
1289 &bo, NULL))
1290 DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
1291
1292 data->bps[i].bo = bo;
1293 data->last_reserved = i + 1;
1294 bo = NULL;
1295 }
1296 out:
1297 mutex_unlock(&con->recovery_lock);
1298 return 0;
1299 }
1300
1301
1302 static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1303 {
1304 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1305 struct ras_err_handler_data *data;
1306 struct amdgpu_bo *bo;
1307 int i;
1308
1309 if (!con || !con->eh_data)
1310 return 0;
1311
1312 mutex_lock(&con->recovery_lock);
1313 data = con->eh_data;
1314 if (!data)
1315 goto out;
1316
1317 for (i = data->last_reserved - 1; i >= 0; i--) {
1318 bo = data->bps[i].bo;
1319
1320 amdgpu_bo_free_kernel(&bo, NULL, NULL);
1321
1322 data->bps[i].bo = bo;
1323 data->last_reserved = i;
1324 }
1325 out:
1326 mutex_unlock(&con->recovery_lock);
1327 return 0;
1328 }
1329
1330 static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1331 {
1332
1333
1334
1335 return 0;
1336 }
1337
1338 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1339 {
1340
1341
1342
1343 return 0;
1344 }
1345
1346 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1347 {
1348 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1349 struct ras_err_handler_data **data = &con->eh_data;
1350
1351 *data = kmalloc(sizeof(**data),
1352 GFP_KERNEL|__GFP_ZERO);
1353 if (!*data)
1354 return -ENOMEM;
1355
1356 mutex_init(&con->recovery_lock);
1357 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1358 atomic_set(&con->in_recovery, 0);
1359 con->adev = adev;
1360
1361 amdgpu_ras_load_bad_pages(adev);
1362 amdgpu_ras_reserve_bad_pages(adev);
1363
1364 return 0;
1365 }
1366
1367 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1368 {
1369 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1370 struct ras_err_handler_data *data = con->eh_data;
1371
1372 cancel_work_sync(&con->recovery_work);
1373 amdgpu_ras_save_bad_pages(adev);
1374 amdgpu_ras_release_bad_pages(adev);
1375
1376 mutex_lock(&con->recovery_lock);
1377 con->eh_data = NULL;
1378 kfree(data->bps);
1379 kfree(data);
1380 mutex_unlock(&con->recovery_lock);
1381
1382 return 0;
1383 }
1384
1385
1386
1387 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1388 unsigned int block)
1389 {
1390 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1391
1392 if (!ras)
1393 return -EINVAL;
1394
1395 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1396 return 0;
1397 }
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
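/*
 * Probe the RAS capability of the device: only bare-metal Vega20 with
 * ECC/SRAM-ECC reported by atomfirmware is treated as RAS capable, and the
 * effective "supported" mask is further gated by the amdgpu_ras_enable and
 * amdgpu_ras_mask module parameters.
 */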
1408 static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1409 uint32_t *hw_supported, uint32_t *supported)
1410 {
1411 *hw_supported = 0;
1412 *supported = 0;
1413
1414 if (amdgpu_sriov_vf(adev) ||
1415 adev->asic_type != CHIP_VEGA20)
1416 return;
1417
1418 if (adev->is_atom_fw &&
1419 (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
1420 amdgpu_atomfirmware_sram_ecc_supported(adev)))
1421 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
1422
1423 *supported = amdgpu_ras_enable == 0 ?
1424 0 : *hw_supported & amdgpu_ras_mask;
1425 }
1426
1427 int amdgpu_ras_init(struct amdgpu_device *adev)
1428 {
1429 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1430
1431 if (con)
1432 return 0;
1433
1434 con = kmalloc(sizeof(struct amdgpu_ras) +
1435 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1436 GFP_KERNEL|__GFP_ZERO);
1437 if (!con)
1438 return -ENOMEM;
1439
1440 con->objs = (struct ras_manager *)(con + 1);
1441
1442 amdgpu_ras_set_context(adev, con);
1443
1444 amdgpu_ras_check_supported(adev, &con->hw_supported,
1445 &con->supported);
1446 if (!con->hw_supported) {
1447 amdgpu_ras_set_context(adev, NULL);
1448 kfree(con);
1449 return 0;
1450 }
1451
1452 con->features = 0;
1453 INIT_LIST_HEAD(&con->head);
1454
1455 con->flags = RAS_DEFAULT_FLAGS;
1456
1457 if (amdgpu_ras_recovery_init(adev))
1458 goto recovery_out;
1459
1460 amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1461
1462 if (amdgpu_ras_fs_init(adev))
1463 goto fs_out;
1464
1465
1466 if (adev->umc.funcs->ras_init)
1467 adev->umc.funcs->ras_init(adev);
1468
1469 DRM_INFO("RAS INFO: ras initialized successfully, "
1470 "hardware ability[%x] ras_mask[%x]\n",
1471 con->hw_supported, con->supported);
1472 return 0;
1473 fs_out:
1474 amdgpu_ras_recovery_fini(adev);
1475 recovery_out:
1476 amdgpu_ras_set_context(adev, NULL);
1477 kfree(con);
1478
1479 return -EINVAL;
1480 }
1481
1482
1483
1484
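/*
 * Resume-time fix-up of the RAS state: when RAS was enabled by the vBIOS,
 * every feature is first enabled in software and the unsupported ones are
 * then disabled again; if a reset was requested during init
 * (AMDGPU_RAS_FLAG_INIT_NEED_RESET), all features are dropped and a GPU
 * reset is triggered.
 */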
1485 void amdgpu_ras_resume(struct amdgpu_device *adev)
1486 {
1487 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1488 struct ras_manager *obj, *tmp;
1489
1490 if (!con)
1491 return;
1492
1493 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
1494
1495
1496
1497
1498
1499 amdgpu_ras_enable_all_features(adev, 1);
1500
1501
1502
1503
1504
1505 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1506 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1507 amdgpu_ras_feature_enable(adev, &obj->head, 0);
1508
1509 WARN_ON(alive_obj(obj));
1510 }
1511 }
1512 }
1513
1514 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1515 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1516
1517
1518
1519
1520
1521
1522
1523 amdgpu_ras_disable_all_features(adev, 1);
1524 amdgpu_ras_reset_gpu(adev, 0);
1525 }
1526 }
1527
1528 void amdgpu_ras_suspend(struct amdgpu_device *adev)
1529 {
1530 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1531
1532 if (!con)
1533 return;
1534
1535 amdgpu_ras_disable_all_features(adev, 0);
1536
1537 if (con->features)
1538 amdgpu_ras_disable_all_features(adev, 1);
1539 }
1540
1541
1542 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1543 {
1544 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1545
1546 if (!con)
1547 return 0;
1548
1549
1550 amdgpu_ras_disable_all_features(adev, 0);
1551 amdgpu_ras_recovery_fini(adev);
1552 return 0;
1553 }
1554
1555 int amdgpu_ras_fini(struct amdgpu_device *adev)
1556 {
1557 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1558
1559 if (!con)
1560 return 0;
1561
1562 amdgpu_ras_fs_fini(adev);
1563 amdgpu_ras_interrupt_remove_all(adev);
1564
1565 WARN(con->features, "Feature mask is not cleared");
1566
1567 if (con->features)
1568 amdgpu_ras_disable_all_features(adev, 1);
1569
1570 amdgpu_ras_set_context(adev, NULL);
1571 kfree(con);
1572
1573 return 0;
1574 }