This source file includes the following definitions:
- amdgpu_ras_debugfs_read
- amdgpu_ras_find_block_id_by_name
- amdgpu_ras_debugfs_ctrl_parse_data
- amdgpu_ras_debugfs_ctrl_write
- amdgpu_ras_sysfs_read
- put_obj
- amdgpu_ras_create_obj
- amdgpu_ras_find_obj
- amdgpu_ras_is_feature_allowed
- amdgpu_ras_is_feature_enabled
- __amdgpu_ras_feature_enable
- amdgpu_ras_feature_enable
- amdgpu_ras_feature_enable_on_boot
- amdgpu_ras_disable_all_features
- amdgpu_ras_enable_all_features
- amdgpu_ras_error_query
- amdgpu_ras_error_inject
- amdgpu_ras_error_cure
- amdgpu_ras_query_error_count
- amdgpu_ras_badpage_flags_str
- amdgpu_ras_sysfs_badpages_read
- amdgpu_ras_sysfs_features_read
- amdgpu_ras_sysfs_create_feature_node
- amdgpu_ras_sysfs_remove_feature_node
- amdgpu_ras_sysfs_create
- amdgpu_ras_sysfs_remove
- amdgpu_ras_sysfs_remove_all
- amdgpu_ras_debugfs_create_ctrl_node
- amdgpu_ras_debugfs_create
- amdgpu_ras_debugfs_remove
- amdgpu_ras_debugfs_remove_all
- amdgpu_ras_fs_init
- amdgpu_ras_fs_fini
- amdgpu_ras_interrupt_handler
- amdgpu_ras_interrupt_process_handler
- amdgpu_ras_interrupt_dispatch
- amdgpu_ras_interrupt_remove_handler
- amdgpu_ras_interrupt_add_handler
- amdgpu_ras_interrupt_remove_all
- amdgpu_ras_badpages_read
- amdgpu_ras_do_recovery
- amdgpu_ras_realloc_eh_data_space
- amdgpu_ras_add_bad_pages
- amdgpu_ras_reserve_bad_pages
- amdgpu_ras_release_bad_pages
- amdgpu_ras_save_bad_pages
- amdgpu_ras_load_bad_pages
- amdgpu_ras_recovery_init
- amdgpu_ras_recovery_fini
- amdgpu_ras_request_reset_on_boot
- amdgpu_ras_check_supported
- amdgpu_ras_init
- amdgpu_ras_resume
- amdgpu_ras_suspend
- amdgpu_ras_pre_fini
- amdgpu_ras_fini
  24 #include <linux/debugfs.h>
  25 #include <linux/list.h>
  26 #include <linux/module.h>
  27 #include <linux/uaccess.h>
  28 
  29 #include "amdgpu.h"
  30 #include "amdgpu_ras.h"
  31 #include "amdgpu_atomfirmware.h"
  32 
  33 const char *ras_error_string[] = {
  34         "none",
  35         "parity",
  36         "single_correctable",
  37         "multi_uncorrectable",
  38         "poison",
  39 };
  40 
  41 const char *ras_block_string[] = {
  42         "umc",
  43         "sdma",
  44         "gfx",
  45         "mmhub",
  46         "athub",
  47         "pcie_bif",
  48         "hdp",
  49         "xgmi_wafl",
  50         "df",
  51         "smn",
  52         "sem",
  53         "mp0",
  54         "mp1",
  55         "fuse",
  56 };
  57 
  58 #define ras_err_str(i) (ras_error_string[ffs(i)])
  59 #define ras_block_str(i) (ras_block_string[i])
  60 
  61 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS           1
  62 #define AMDGPU_RAS_FLAG_INIT_NEED_RESET         2
  63 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
  64 
  65 
  66 #define RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
  67 
  68 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
  69                                         size_t size, loff_t *pos)
  70 {
  71         struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
  72         struct ras_query_if info = {
  73                 .head = obj->head,
  74         };
  75         ssize_t s;
  76         char val[128];
  77 
  78         if (amdgpu_ras_error_query(obj->adev, &info))
  79                 return -EINVAL;
  80 
  81         s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
  82                         "ue", info.ue_count,
  83                         "ce", info.ce_count);
  84         if (*pos >= s)
  85                 return 0;
  86 
  87         s -= *pos;
  88         s = min_t(u64, s, size);
  89 
  90 
  91         if (copy_to_user(buf, &val[*pos], s))
  92                 return -EINVAL;
  93 
  94         *pos += s;
  95 
  96         return s;
  97 }
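      /* Usage sketch (not part of the driver): each per-block node created
       * by amdgpu_ras_debugfs_create() lands under the "ras" debugfs dir
       * and prints the counters in the format built above. Assuming card 0
       * and a node registered with debugfs_name "umc_err_count":
       *
       *      $ cat /sys/kernel/debug/dri/0/ras/umc_err_count
       *      ue: 0
       *      ce: 1
       */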
  98 
  99 static const struct file_operations amdgpu_ras_debugfs_ops = {
 100         .owner = THIS_MODULE,
 101         .read = amdgpu_ras_debugfs_read,
 102         .write = NULL,
 103         .llseek = default_llseek
 104 };
 105 
 106 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
 107 {
 108         int i;
 109 
 110         for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
 111                 *block_id = i;
 112                 if (strcmp(name, ras_block_str(i)) == 0)
 113                         return 0;
 114         }
 115         return -EINVAL;
 116 }
 117 
 118 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 119                 const char __user *buf, size_t size,
 120                 loff_t *pos, struct ras_debug_if *data)
 121 {
 122         ssize_t s = min_t(u64, 64, size);
 123         char str[65];
 124         char block_name[33];
 125         char err[9] = "ue";
 126         int op = -1;
 127         int block_id;
 128         uint32_t sub_block;
 129         u64 address, value;
 130 
 131         if (*pos)
 132                 return -EINVAL;
 133         *pos = size;
 134 
 135         memset(str, 0, sizeof(str));
 136         memset(data, 0, sizeof(*data));
 137 
 138         if (copy_from_user(str, buf, s))
 139                 return -EINVAL;
 140 
 141         if (sscanf(str, "disable %32s", block_name) == 1)
 142                 op = 0;
 143         else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
 144                 op = 1;
 145         else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 146                 op = 2;
  147         else if (str[0] && str[1] && str[2] && str[3])
  148                 /* ascii string, but commands are not matched */
  149                 return -EINVAL;
 150 
 151         if (op != -1) {
 152                 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
 153                         return -EINVAL;
 154 
  155                 data->head.block = block_id;
  156                 /* only ue and ce errors are supported */
 157                 if (!memcmp("ue", err, 2))
 158                         data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 159                 else if (!memcmp("ce", err, 2))
 160                         data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
 161                 else
 162                         return -EINVAL;
 163 
 164                 data->op = op;
 165 
 166                 if (op == 2) {
 167                         if (sscanf(str, "%*s %*s %*s %u %llu %llu",
 168                                                 &sub_block, &address, &value) != 3)
 169                                 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
 170                                                         &sub_block, &address, &value) != 3)
 171                                         return -EINVAL;
 172                         data->head.sub_block_index = sub_block;
 173                         data->inject.address = address;
 174                         data->inject.value = value;
 175                 }
 176         } else {
 177                 if (size < sizeof(*data))
 178                         return -EINVAL;
 179 
 180                 if (copy_from_user(data, buf, sizeof(*data)))
 181                         return -EINVAL;
 182         }
 183 
 184         return 0;
 185 }
 186 
  187 /**
  188  * DOC: AMDGPU RAS debugfs control interface
  189  *
  190  * The control node accepts a struct ras_debug_if, which has two members.
  191  *
  192  * First member: ras_debug_if::head or ras_debug_if::inject.
  193  *
  194  * head is used to indicate which IP block will be under control.
  195  * It has four members: block, type, sub_block_index and name.
  196  * block: which IP will be under control.
  197  * type: what kind of error will be enabled/disabled/injected.
  198  * sub_block_index: some IPs have subcomponents, say, GFX, sDMA.
  199  * name: the name of the IP.
  200  *
  201  * inject has two more members than head: address and value.
  202  * As their names indicate, the inject operation will write the value
  203  * to the address.
  204  *
  205  * Second member: struct ras_debug_if::op.
  206  * It has three kinds of operations.
  207  *
  208  * - 0: disable RAS on the block. Take ::head as its data.
  209  * - 1: enable RAS on the block. Take ::head as its data.
  210  * - 2: inject errors on the block. Take ::inject as its data.
  211  *
  212  * How to use the interface?
  213  * Programs can copy the struct ras_debug_if in their code and write it
  214  * to the control node.
  215  *
  216  * From bash, the node also accepts the ascii commands parsed above:
  217  *
  218  *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
  219  *
  220  *	op: disable, enable, inject
  221  *		disable: only block is needed
  222  *		enable: block and error are needed
  223  *		inject: error, address and value are needed
  224  *	block: umc, sdma, gfx, ... (see ras_block_string[] above)
  225  *	error: ue (multi_uncorrectable), ce (single_correctable)
  226  *
  227  * Usage examples, assuming card 0:
  228  *
  229  *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
  230  *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
  231  *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
  232  *
  233  * Operations are only allowed on blocks that are supported; check the
  234  * ras mask at /sys/module/amdgpu/parameters/ras_mask.
  235  */
  236 
  237 
  238 
  239 
  240 
  241 
  242 
  243 
  244 
 245 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
 246                 size_t size, loff_t *pos)
 247 {
 248         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
 249         struct ras_debug_if data;
 250         int ret = 0;
 251 
 252         ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
 253         if (ret)
 254                 return -EINVAL;
 255 
 256         if (!amdgpu_ras_is_supported(adev, data.head.block))
 257                 return -EINVAL;
 258 
 259         switch (data.op) {
 260         case 0:
 261                 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
 262                 break;
 263         case 1:
 264                 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
 265                 break;
 266         case 2:
 267                 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
 268                     (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
 269                         ret = -EINVAL;
 270                         break;
 271                 }
 272 
 273                 
 274                 ret = amdgpu_ras_error_inject(adev, &data.inject);
 275                 break;
 276         default:
 277                 ret = -EINVAL;
 278                 break;
  279         }
 280 
 281         if (ret)
 282                 return -EINVAL;
 283 
 284         return size;
 285 }
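      /* Usage sketch (not part of the driver): besides the ascii commands,
       * the parser above accepts a raw struct ras_debug_if, so a test
       * program can drive the node directly; the path assumes card 0:
       *
       *      struct ras_debug_if data = { 0 };
       *      int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
       *
       *      data.head.block = AMDGPU_RAS_BLOCK__UMC;
       *      data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
       *      data.op = 1;    // 1 == enable, see the switch above
       *      write(fd, &data, sizeof(data));
       *      close(fd);
       */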
 286 
 287 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 288         .owner = THIS_MODULE,
 289         .read = NULL,
 290         .write = amdgpu_ras_debugfs_ctrl_write,
 291         .llseek = default_llseek
 292 };
 293 
 294 static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 295                 struct device_attribute *attr, char *buf)
 296 {
 297         struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
 298         struct ras_query_if info = {
 299                 .head = obj->head,
 300         };
 301 
 302         if (amdgpu_ras_error_query(obj->adev, &info))
 303                 return -EINVAL;
 304 
 305         return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
 306                         "ue", info.ue_count,
 307                         "ce", info.ce_count);
 308 }
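      /* Usage sketch (not part of the driver): nodes registered through
       * amdgpu_ras_sysfs_create() below appear in the device's "ras"
       * group and print the same counters:
       *
       *      $ cat /sys/class/drm/card0/device/ras/umc_err_count
       *      ue: 0
       *      ce: 1
       */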
 309 
  310 /* obj begin */
  311 
 312 #define get_obj(obj) do { (obj)->use++; } while (0)
 313 #define alive_obj(obj) ((obj)->use)
 314 
 315 static inline void put_obj(struct ras_manager *obj)
 316 {
 317         if (obj && --obj->use == 0)
 318                 list_del(&obj->node);
 319         if (obj && obj->use < 0) {
  320                 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
 321         }
 322 }
 323 
  324 /* make one obj and return it. */
 325 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 326                 struct ras_common_if *head)
 327 {
 328         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 329         struct ras_manager *obj;
 330 
 331         if (!con)
 332                 return NULL;
 333 
 334         if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 335                 return NULL;
 336 
 337         obj = &con->objs[head->block];
  338         /* already exist. return obj? */
 339         if (alive_obj(obj))
 340                 return NULL;
 341 
 342         obj->head = *head;
 343         obj->adev = adev;
 344         list_add(&obj->node, &con->head);
 345         get_obj(obj);
 346 
 347         return obj;
 348 }
 349 
  350 /* return an obj equal to head, or the first when head is NULL */
 351 static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 352                 struct ras_common_if *head)
 353 {
 354         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 355         struct ras_manager *obj;
 356         int i;
 357 
 358         if (!con)
 359                 return NULL;
 360 
 361         if (head) {
 362                 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 363                         return NULL;
 364 
 365                 obj = &con->objs[head->block];
 366 
 367                 if (alive_obj(obj)) {
 368                         WARN_ON(head->block != obj->head.block);
 369                         return obj;
 370                 }
 371         } else {
 372                 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
 373                         obj = &con->objs[i];
 374                         if (alive_obj(obj)) {
 375                                 WARN_ON(i != obj->head.block);
 376                                 return obj;
 377                         }
 378                 }
 379         }
 380 
 381         return NULL;
 382 }
  383 /* obj end */
  384 
  385 /* feature ctl begin */
 386 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
 387                 struct ras_common_if *head)
 388 {
 389         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 390 
 391         return con->hw_supported & BIT(head->block);
 392 }
 393 
 394 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
 395                 struct ras_common_if *head)
 396 {
 397         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 398 
 399         return con->features & BIT(head->block);
 400 }
 401 
 402 
 403 
 404 
 405 
 406 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 407                 struct ras_common_if *head, int enable)
 408 {
 409         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 410         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 411 
  412         /* If hardware does not support ras, then do not create obj.
  413          * But if hardware supports ras, we can create the obj.
  414          * The ras framework checks con->hw_supported to see if it needs
  415          * to do ras init for each block.
  416          */
  417 
 418         if (!amdgpu_ras_is_feature_allowed(adev, head))
 419                 return 0;
 420         if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
 421                 return 0;
 422 
 423         if (enable) {
 424                 if (!obj) {
 425                         obj = amdgpu_ras_create_obj(adev, head);
 426                         if (!obj)
 427                                 return -EINVAL;
 428                 } else {
  429                         /* In case we create obj somewhere else */
 430                         get_obj(obj);
 431                 }
 432                 con->features |= BIT(head->block);
 433         } else {
 434                 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
 435                         con->features &= ~BIT(head->block);
 436                         put_obj(obj);
 437                 }
 438         }
 439 
 440         return 0;
 441 }
 442 
  443 /* wrapper of psp_ras_enable_features */
 444 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 445                 struct ras_common_if *head, bool enable)
 446 {
 447         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 448         union ta_ras_cmd_input info;
 449         int ret;
 450 
 451         if (!con)
 452                 return -EINVAL;
 453 
 454         if (!enable) {
 455                 info.disable_features = (struct ta_ras_disable_features_input) {
 456                         .block_id =  amdgpu_ras_block_to_ta(head->block),
 457                         .error_type = amdgpu_ras_error_to_ta(head->type),
 458                 };
 459         } else {
 460                 info.enable_features = (struct ta_ras_enable_features_input) {
 461                         .block_id =  amdgpu_ras_block_to_ta(head->block),
 462                         .error_type = amdgpu_ras_error_to_ta(head->type),
 463                 };
 464         }
 465 
  466         /* Do not enable if it is not allowed. */
 467         WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
  468         /* Are we already in the state we are going to set? */
 469         if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
 470                 return 0;
 471 
 472         ret = psp_ras_enable_features(&adev->psp, &info, enable);
 473         if (ret) {
 474                 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
 475                                 enable ? "enable":"disable",
 476                                 ras_block_str(head->block),
 477                                 ret);
 478                 if (ret == TA_RAS_STATUS__RESET_NEEDED)
 479                         return -EAGAIN;
 480                 return -EINVAL;
 481         }
 482 
  483         /* setup the obj */
 484         __amdgpu_ras_feature_enable(adev, head, enable);
 485 
 486         return 0;
 487 }
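      /* Usage sketch (not part of the driver): a typical caller fills a
       * ras_common_if for its block and toggles the feature; -EAGAIN
       * means the TA asked for a gpu reset and repost first:
       *
       *      struct ras_common_if head = {
       *              .block = AMDGPU_RAS_BLOCK__GFX,
       *              .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
       *              .sub_block_index = 0,
       *              .name = "gfx",
       *      };
       *      int r = amdgpu_ras_feature_enable(adev, &head, true);
       */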
 488 
  489 /* Only used in device probe stage and called only once. */
 490 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
 491                 struct ras_common_if *head, bool enable)
 492 {
 493         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 494         int ret;
 495 
 496         if (!con)
 497                 return -EINVAL;
 498 
 499         if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
 500                 if (enable) {
  501                         /* There is no harm to issue a ras TA cmd regardless of
  502                          * the current ras state.
  503                          * If current state == target state, it will do nothing.
  504                          * But sometimes it requests driver to reset and repost
  505                          * with error code -EAGAIN.
  506                          */
 507                         ret = amdgpu_ras_feature_enable(adev, head, 1);
  508                         /* With old ras TA, we might fail to enable ras.
  509                          * Then let's just set up the object.
  510                          * TODO: remove this WA in the future.
  511                          */
 512                         if (ret == -EINVAL) {
 513                                 ret = __amdgpu_ras_feature_enable(adev, head, 1);
 514                                 if (!ret)
 515                                         DRM_INFO("RAS INFO: %s setup object\n",
 516                                                 ras_block_str(head->block));
 517                         }
 518                 } else {
  519                         /* setup the object then issue a ras TA disable cmd. */
 520                         ret = __amdgpu_ras_feature_enable(adev, head, 1);
 521                         if (ret)
 522                                 return ret;
 523 
 524                         ret = amdgpu_ras_feature_enable(adev, head, 0);
 525                 }
 526         } else
 527                 ret = amdgpu_ras_feature_enable(adev, head, enable);
 528 
 529         return ret;
 530 }
 531 
 532 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
 533                 bool bypass)
 534 {
 535         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 536         struct ras_manager *obj, *tmp;
 537 
 538         list_for_each_entry_safe(obj, tmp, &con->head, node) {
  539                 /* bypass psp.
  540                  * aka just release the obj and corresponding flags
  541                  */
 542                 if (bypass) {
 543                         if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
 544                                 break;
 545                 } else {
 546                         if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
 547                                 break;
 548                 }
 549         }
 550 
 551         return con->features;
 552 }
 553 
 554 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 555                 bool bypass)
 556 {
 557         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 558         int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
 559         int i;
 560         const enum amdgpu_ras_error_type default_ras_type =
 561                 AMDGPU_RAS_ERROR__NONE;
 562 
 563         for (i = 0; i < ras_block_count; i++) {
 564                 struct ras_common_if head = {
 565                         .block = i,
 566                         .type = default_ras_type,
 567                         .sub_block_index = 0,
 568                 };
 569                 strcpy(head.name, ras_block_str(i));
 570                 if (bypass) {
  571                         /* bypass psp.
  572                          * aka just create the obj and mark the feature as
  573                          * enabled; vbios has already enabled ras for us.
  574                          */
 575                         if (__amdgpu_ras_feature_enable(adev, &head, 1))
 576                                 break;
 577                 } else {
 578                         if (amdgpu_ras_feature_enable(adev, &head, 1))
 579                                 break;
 580                 }
 581         }
 582 
 583         return con->features;
 584 }
  585 /* feature ctl end */
  586 
  587 /* query/inject/cure begin */
 588 int amdgpu_ras_error_query(struct amdgpu_device *adev,
 589                 struct ras_query_if *info)
 590 {
 591         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 592         struct ras_err_data err_data = {0, 0, 0, NULL};
 593 
 594         if (!obj)
 595                 return -EINVAL;
 596 
 597         switch (info->head.block) {
 598         case AMDGPU_RAS_BLOCK__UMC:
 599                 if (adev->umc.funcs->query_ras_error_count)
 600                         adev->umc.funcs->query_ras_error_count(adev, &err_data);
  601                 /* also query the error addresses here; they are the
  602                  * input for bad page retirement (amdgpu_ras_add_bad_pages)
  603                  */
 604                 if (adev->umc.funcs->query_ras_error_address)
 605                         adev->umc.funcs->query_ras_error_address(adev, &err_data);
 606                 break;
 607         case AMDGPU_RAS_BLOCK__GFX:
 608                 if (adev->gfx.funcs->query_ras_error_count)
 609                         adev->gfx.funcs->query_ras_error_count(adev, &err_data);
 610                 break;
 611         case AMDGPU_RAS_BLOCK__MMHUB:
 612                 if (adev->mmhub_funcs->query_ras_error_count)
 613                         adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
 614                 break;
 615         default:
 616                 break;
 617         }
 618 
 619         obj->err_data.ue_count += err_data.ue_count;
 620         obj->err_data.ce_count += err_data.ce_count;
 621 
 622         info->ue_count = obj->err_data.ue_count;
 623         info->ce_count = obj->err_data.ce_count;
 624 
 625         if (err_data.ce_count)
 626                 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
 627                          obj->err_data.ce_count, ras_block_str(info->head.block));
 628         if (err_data.ue_count)
 629                 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
 630                          obj->err_data.ue_count, ras_block_str(info->head.block));
 631 
 632         return 0;
 633 }
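      /* Usage sketch (not part of the driver): the sysfs and debugfs read
       * paths above both wrap this query; in-kernel callers can do the
       * same, where obj_head is the block's ras_common_if:
       *
       *      struct ras_query_if info = {
       *              .head = obj_head,
       *      };
       *      if (!amdgpu_ras_error_query(adev, &info))
       *              dev_info(adev->dev, "ue %lu ce %lu\n",
       *                       info.ue_count, info.ce_count);
       */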
 634 
  635 /* wrapper of psp_ras_trigger_error */
 636 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 637                 struct ras_inject_if *info)
 638 {
 639         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 640         struct ta_ras_trigger_error_input block_info = {
 641                 .block_id =  amdgpu_ras_block_to_ta(info->head.block),
 642                 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
 643                 .sub_block_index = info->head.sub_block_index,
 644                 .address = info->address,
 645                 .value = info->value,
 646         };
 647         int ret = 0;
 648 
 649         if (!obj)
 650                 return -EINVAL;
 651 
 652         switch (info->head.block) {
 653         case AMDGPU_RAS_BLOCK__GFX:
 654                 if (adev->gfx.funcs->ras_error_inject)
 655                         ret = adev->gfx.funcs->ras_error_inject(adev, info);
 656                 else
 657                         ret = -EINVAL;
 658                 break;
 659         case AMDGPU_RAS_BLOCK__UMC:
 660         case AMDGPU_RAS_BLOCK__MMHUB:
 661                 ret = psp_ras_trigger_error(&adev->psp, &block_info);
 662                 break;
 663         default:
 664                 DRM_INFO("%s error injection is not supported yet\n",
 665                          ras_block_str(info->head.block));
 666                 ret = -EINVAL;
 667         }
 668 
 669         if (ret)
 670                 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
 671                                 ras_block_str(info->head.block),
 672                                 ret);
 673 
 674         return ret;
 675 }
 676 
 677 int amdgpu_ras_error_cure(struct amdgpu_device *adev,
 678                 struct ras_cure_if *info)
 679 {
  680         /* psp fw has no cure interface for now. */
 681         return 0;
 682 }
 683 
  684 /* get the total error counts on all IPs */
 685 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 686                 bool is_ce)
 687 {
 688         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 689         struct ras_manager *obj;
 690         struct ras_err_data data = {0, 0};
 691 
 692         if (!con)
 693                 return 0;
 694 
 695         list_for_each_entry(obj, &con->head, node) {
 696                 struct ras_query_if info = {
 697                         .head = obj->head,
 698                 };
 699 
 700                 if (amdgpu_ras_error_query(adev, &info))
 701                         return 0;
 702 
 703                 data.ce_count += info.ce_count;
 704                 data.ue_count += info.ue_count;
 705         }
 706 
 707         return is_ce ? data.ce_count : data.ue_count;
 708 }
  709 /* query/inject/cure end */
  710 
  711 
  712 /* sysfs begin */
  713 
 714 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 715                 struct ras_badpage **bps, unsigned int *count);
 716 
 717 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 718 {
 719         switch (flags) {
 720         case 0:
 721                 return "R";
 722         case 1:
 723                 return "P";
 724         case 2:
 725         default:
 726                 return "F";
  727         }
 728 }
  729 
  730 /**
  731  * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages interface
  732  *
  733  * It allows the user to read the bad pages of vram on the gpu through
  734  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
  735  *
  736  * It outputs multiple lines, and each line stands for one gpu page.
  737  *
  738  * The format of one line is below,
  739  * gpu pfn : gpu page size : flags
  740  *
  741  * gpu pfn and gpu page size are printed in hex format.
  742  * flags can be one of below characters,
  743  * R: reserved, this gpu page is reserved and not able to use.
  744  * P: pending for reserve, this gpu page is marked as bad, and will be
  745  *    reserved in the next window of page_reserve.
  746  * F: unable to reserve. this gpu page can't be reserved due to some reasons.
  747  *
  748  * Examples:
  749  * 0x00000001 : 0x00001000 : R
  750  * 0x00000002 : 0x00001000 : P
  751  */
  752 
 753 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
 754                 struct kobject *kobj, struct bin_attribute *attr,
 755                 char *buf, loff_t ppos, size_t count)
 756 {
 757         struct amdgpu_ras *con =
 758                 container_of(attr, struct amdgpu_ras, badpages_attr);
 759         struct amdgpu_device *adev = con->adev;
 760         const unsigned int element_size =
 761                 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
 762         unsigned int start = div64_ul(ppos + element_size - 1, element_size);
 763         unsigned int end = div64_ul(ppos + count - 1, element_size);
 764         ssize_t s = 0;
 765         struct ras_badpage *bps = NULL;
 766         unsigned int bps_count = 0;
 767 
 768         memset(buf, 0, count);
 769 
 770         if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
 771                 return 0;
 772 
 773         for (; start < end && start < bps_count; start++)
 774                 s += scnprintf(&buf[s], element_size + 1,
 775                                 "0x%08x : 0x%08x : %1s\n",
 776                                 bps[start].bp,
 777                                 bps[start].size,
 778                                 amdgpu_ras_badpage_flags_str(bps[start].flags));
 779 
 780         kfree(bps);
 781 
 782         return s;
 783 }
 784 
 785 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 786                 struct device_attribute *attr, char *buf)
 787 {
 788         struct amdgpu_ras *con =
 789                 container_of(attr, struct amdgpu_ras, features_attr);
 790 
 791         return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 792 }
 793 
 794 static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 795 {
 796         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 797         struct attribute *attrs[] = {
 798                 &con->features_attr.attr,
 799                 NULL
 800         };
 801         struct bin_attribute *bin_attrs[] = {
 802                 &con->badpages_attr,
 803                 NULL
 804         };
 805         struct attribute_group group = {
 806                 .name = "ras",
 807                 .attrs = attrs,
 808                 .bin_attrs = bin_attrs,
 809         };
 810 
 811         con->features_attr = (struct device_attribute) {
 812                 .attr = {
 813                         .name = "features",
 814                         .mode = S_IRUGO,
 815                 },
  816                 .show = amdgpu_ras_sysfs_features_read,
 817         };
 818 
 819         con->badpages_attr = (struct bin_attribute) {
 820                 .attr = {
 821                         .name = "gpu_vram_bad_pages",
 822                         .mode = S_IRUGO,
 823                 },
 824                 .size = 0,
 825                 .private = NULL,
 826                 .read = amdgpu_ras_sysfs_badpages_read,
 827         };
 828 
 829         sysfs_attr_init(attrs[0]);
 830         sysfs_bin_attr_init(bin_attrs[0]);
 831 
 832         return sysfs_create_group(&adev->dev->kobj, &group);
 833 }
 834 
 835 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 836 {
 837         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 838         struct attribute *attrs[] = {
 839                 &con->features_attr.attr,
 840                 NULL
 841         };
 842         struct bin_attribute *bin_attrs[] = {
 843                 &con->badpages_attr,
 844                 NULL
 845         };
 846         struct attribute_group group = {
 847                 .name = "ras",
 848                 .attrs = attrs,
 849                 .bin_attrs = bin_attrs,
 850         };
 851 
 852         sysfs_remove_group(&adev->dev->kobj, &group);
 853 
 854         return 0;
 855 }
 856 
 857 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
 858                 struct ras_fs_if *head)
 859 {
 860         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 861 
 862         if (!obj || obj->attr_inuse)
 863                 return -EINVAL;
 864 
 865         get_obj(obj);
 866 
 867         memcpy(obj->fs_data.sysfs_name,
 868                         head->sysfs_name,
 869                         sizeof(obj->fs_data.sysfs_name));
 870 
 871         obj->sysfs_attr = (struct device_attribute){
 872                 .attr = {
 873                         .name = obj->fs_data.sysfs_name,
 874                         .mode = S_IRUGO,
 875                 },
  876                 .show = amdgpu_ras_sysfs_read,
 877         };
 878         sysfs_attr_init(&obj->sysfs_attr.attr);
 879 
 880         if (sysfs_add_file_to_group(&adev->dev->kobj,
 881                                 &obj->sysfs_attr.attr,
 882                                 "ras")) {
 883                 put_obj(obj);
 884                 return -EINVAL;
 885         }
 886 
 887         obj->attr_inuse = 1;
 888 
 889         return 0;
 890 }
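      /* Usage sketch (not part of the driver): an IP block usually
       * registers its sysfs counter node and its debugfs node with the
       * same head; the names below are only examples:
       *
       *      struct ras_fs_if fs_info = {
       *              .head = head,
       *              .sysfs_name = "umc_err_count",
       *              .debugfs_name = "umc_err_inject",
       *      };
       *      amdgpu_ras_sysfs_create(adev, &fs_info);
       *      amdgpu_ras_debugfs_create(adev, &fs_info);
       */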
 891 
 892 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 893                 struct ras_common_if *head)
 894 {
 895         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 896 
 897         if (!obj || !obj->attr_inuse)
 898                 return -EINVAL;
 899 
 900         sysfs_remove_file_from_group(&adev->dev->kobj,
 901                                 &obj->sysfs_attr.attr,
 902                                 "ras");
 903         obj->attr_inuse = 0;
 904         put_obj(obj);
 905 
 906         return 0;
 907 }
 908 
 909 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 910 {
 911         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 912         struct ras_manager *obj, *tmp;
 913 
 914         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 915                 amdgpu_ras_sysfs_remove(adev, &obj->head);
 916         }
 917 
 918         amdgpu_ras_sysfs_remove_feature_node(adev);
 919 
 920         return 0;
 921 }
  922 /* sysfs end */
  923 
  924 /* debugfs begin */
 925 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 926 {
 927         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 928         struct drm_minor *minor = adev->ddev->primary;
 929 
 930         con->dir = debugfs_create_dir("ras", minor->debugfs_root);
 931         con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
 932                                        adev, &amdgpu_ras_debugfs_ctrl_ops);
 933 }
 934 
 935 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 936                 struct ras_fs_if *head)
 937 {
 938         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 939         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 940 
 941         if (!obj || obj->ent)
 942                 return;
 943 
 944         get_obj(obj);
 945 
 946         memcpy(obj->fs_data.debugfs_name,
 947                         head->debugfs_name,
 948                         sizeof(obj->fs_data.debugfs_name));
 949 
 950         obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
 951                                        S_IWUGO | S_IRUGO, con->dir, obj,
 952                                        &amdgpu_ras_debugfs_ops);
 953 }
 954 
 955 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 956                 struct ras_common_if *head)
 957 {
 958         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 959 
 960         if (!obj || !obj->ent)
 961                 return;
 962 
 963         debugfs_remove(obj->ent);
 964         obj->ent = NULL;
 965         put_obj(obj);
 966 }
 967 
 968 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 969 {
 970         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 971         struct ras_manager *obj, *tmp;
 972 
 973         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 974                 amdgpu_ras_debugfs_remove(adev, &obj->head);
 975         }
 976 
 977         debugfs_remove(con->ent);
 978         debugfs_remove(con->dir);
 979         con->dir = NULL;
 980         con->ent = NULL;
 981 }
 982 
  983 /* debugfs end */
  984 
  985 /* ras fs */
 986 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 987 {
 988         amdgpu_ras_sysfs_create_feature_node(adev);
 989         amdgpu_ras_debugfs_create_ctrl_node(adev);
 990 
 991         return 0;
 992 }
 993 
 994 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 995 {
 996         amdgpu_ras_debugfs_remove_all(adev);
 997         amdgpu_ras_sysfs_remove_all(adev);
 998         return 0;
 999 }
 1000 /* ras fs end */
 1001 
 1002 /* ih begin */
1003 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1004 {
1005         struct ras_ih_data *data = &obj->ih_data;
1006         struct amdgpu_iv_entry entry;
1007         int ret;
1008         struct ras_err_data err_data = {0, 0, 0, NULL};
1009 
1010         while (data->rptr != data->wptr) {
 1011                 rmb(); /* make sure the entry is ready before reading it */
1012                 memcpy(&entry, &data->ring[data->rptr],
1013                                 data->element_size);
1014 
 1015                 wmb(); /* make sure the copy is done before the slot is recycled */
1016                 data->rptr = (data->aligned_element_size +
1017                                 data->rptr) % data->ring_size;
1018 
 1019                 /* Let IP handle its data, maybe we need get the output
 1020                  * from the callback to update the error type/count, etc.
 1021                  */
1022                 if (data->cb) {
1023                         ret = data->cb(obj->adev, &err_data, &entry);
 1024                         /* ue will trigger an interrupt, and in that case
 1025                          * we need do a reset to recover the whole system.
 1026                          * But leave IP do that recovery, here we just
 1027                          * dispatch the error.
 1028                          */
1029                         if (ret == AMDGPU_RAS_SUCCESS) {
 1030                                 /* these counts could be left as 0 if
 1031                                  * some blocks do not count error number
 1032                                  */
1033                                 obj->err_data.ue_count += err_data.ue_count;
1034                                 obj->err_data.ce_count += err_data.ce_count;
1035                         }
1036                 }
1037         }
1038 }
1039 
1040 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1041 {
1042         struct ras_ih_data *data =
1043                 container_of(work, struct ras_ih_data, ih_work);
1044         struct ras_manager *obj =
1045                 container_of(data, struct ras_manager, ih_data);
1046 
1047         amdgpu_ras_interrupt_handler(obj);
1048 }
1049 
1050 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1051                 struct ras_dispatch_if *info)
1052 {
 1053         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 1054         struct ras_ih_data *data;
 1055 
 1056         if (!obj)
 1057                 return -EINVAL;
 1058 
 1059         data = &obj->ih_data;
 1060         if (data->inuse == 0)
 1061                 return 0;
 1062 
1063         memcpy(&data->ring[data->wptr], info->entry,
1064                         data->element_size);
1065 
 1066         wmb(); /* publish the entry before wptr moves */
1067         data->wptr = (data->aligned_element_size +
1068                         data->wptr) % data->ring_size;
1069 
1070         schedule_work(&data->ih_work);
1071 
1072         return 0;
1073 }
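      /* Usage sketch (not part of the driver): an IP's interrupt handler
       * forwards the iv entry here and returns; the registered callback
       * then runs from the work item scheduled above:
       *
       *      struct ras_dispatch_if ih_data = {
       *              .head = head,   // the block's ras_common_if
       *              .entry = entry, // the amdgpu_iv_entry being handled
       *      };
       *      amdgpu_ras_interrupt_dispatch(adev, &ih_data);
       */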
1074 
1075 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1076                 struct ras_ih_if *info)
1077 {
1078         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1079         struct ras_ih_data *data;
1080 
1081         if (!obj)
1082                 return -EINVAL;
1083 
1084         data = &obj->ih_data;
1085         if (data->inuse == 0)
1086                 return 0;
1087 
1088         cancel_work_sync(&data->ih_work);
1089 
1090         kfree(data->ring);
1091         memset(data, 0, sizeof(*data));
1092         put_obj(obj);
1093 
1094         return 0;
1095 }
1096 
1097 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1098                 struct ras_ih_if *info)
1099 {
1100         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1101         struct ras_ih_data *data;
1102 
1103         if (!obj) {
 1104                 /* in case we register the IH before enabling the ras feature */
1105                 obj = amdgpu_ras_create_obj(adev, &info->head);
1106                 if (!obj)
1107                         return -EINVAL;
1108         } else
1109                 get_obj(obj);
1110 
1111         data = &obj->ih_data;
1112         
1113         *data = (struct ras_ih_data) {
1114                 .inuse = 0,
1115                 .cb = info->cb,
1116                 .element_size = sizeof(struct amdgpu_iv_entry),
1117                 .rptr = 0,
1118                 .wptr = 0,
1119         };
1120 
1121         INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1122 
1123         data->aligned_element_size = ALIGN(data->element_size, 8);
 1124         /* the ring can store 64 iv entries. */
1125         data->ring_size = 64 * data->aligned_element_size;
1126         data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1127         if (!data->ring) {
1128                 put_obj(obj);
1129                 return -ENOMEM;
1130         }
1131 
1132         
1133         data->inuse = 1;
1134 
1135         return 0;
1136 }
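      /* Usage sketch (not part of the driver): the callback is the
       * per-block bottom half invoked by amdgpu_ras_interrupt_handler();
       * the callback name here is hypothetical:
       *
       *      struct ras_ih_if ih_info = {
       *              .head = head,
       *              .cb = my_block_process_ras_data_cb,
       *      };
       *      amdgpu_ras_interrupt_add_handler(adev, &ih_info);
       */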
1137 
1138 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1139 {
1140         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1141         struct ras_manager *obj, *tmp;
1142 
1143         list_for_each_entry_safe(obj, tmp, &con->head, node) {
1144                 struct ras_ih_if info = {
1145                         .head = obj->head,
1146                 };
1147                 amdgpu_ras_interrupt_remove_handler(adev, &info);
1148         }
1149 
1150         return 0;
1151 }
1152 
 1153 /* ih end */
 1154 
 1155 /* recovery begin */
 1156 
 1157 /* return 0 on success; caller needs to free bps. */
 1158 
1159 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1160                 struct ras_badpage **bps, unsigned int *count)
1161 {
1162         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1163         struct ras_err_handler_data *data;
1164         int i = 0;
1165         int ret = 0;
1166 
1167         if (!con || !con->eh_data || !bps || !count)
1168                 return -EINVAL;
1169 
1170         mutex_lock(&con->recovery_lock);
1171         data = con->eh_data;
1172         if (!data || data->count == 0) {
1173                 *bps = NULL;
1174                 goto out;
1175         }
1176 
1177         *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1178         if (!*bps) {
1179                 ret = -ENOMEM;
1180                 goto out;
1181         }
1182 
1183         for (; i < data->count; i++) {
1184                 (*bps)[i] = (struct ras_badpage){
1185                         .bp = data->bps[i].bp,
1186                         .size = AMDGPU_GPU_PAGE_SIZE,
1187                         .flags = 0,
1188                 };
1189 
1190                 if (data->last_reserved <= i)
1191                         (*bps)[i].flags = 1;
1192                 else if (data->bps[i].bo == NULL)
1193                         (*bps)[i].flags = 2;
1194         }
1195 
1196         *count = data->count;
1197 out:
1198         mutex_unlock(&con->recovery_lock);
1199         return ret;
1200 }
1201 
1202 static void amdgpu_ras_do_recovery(struct work_struct *work)
1203 {
1204         struct amdgpu_ras *ras =
1205                 container_of(work, struct amdgpu_ras, recovery_work);
1206 
1207         amdgpu_device_gpu_recover(ras->adev, 0);
1208         atomic_set(&ras->in_recovery, 0);
1209 }
1210 
 1211 /* alloc/realloc bps array */
1212 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1213                 struct ras_err_handler_data *data, int pages)
1214 {
1215         unsigned int old_space = data->count + data->space_left;
1216         unsigned int new_space = old_space + pages;
1217         unsigned int align_space = ALIGN(new_space, 1024);
1218         void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1219 
1220         if (!tmp)
1221                 return -ENOMEM;
1222 
1223         if (data->bps) {
1224                 memcpy(tmp, data->bps,
1225                                 data->count * sizeof(*data->bps));
1226                 kfree(data->bps);
1227         }
1228 
1229         data->bps = tmp;
1230         data->space_left += align_space - old_space;
1231         return 0;
1232 }
1233 
 1234 /* it deals with vram only. */
1235 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1236                 unsigned long *bps, int pages)
1237 {
1238         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1239         struct ras_err_handler_data *data;
1240         int i = pages;
1241         int ret = 0;
1242 
1243         if (!con || !con->eh_data || !bps || pages <= 0)
1244                 return 0;
1245 
1246         mutex_lock(&con->recovery_lock);
1247         data = con->eh_data;
1248         if (!data)
1249                 goto out;
1250 
1251         if (data->space_left <= pages)
1252                 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1253                         ret = -ENOMEM;
1254                         goto out;
1255                 }
1256 
1257         while (i--)
1258                 data->bps[data->count++].bp = bps[i];
1259 
1260         data->space_left -= pages;
1261 out:
1262         mutex_unlock(&con->recovery_lock);
1263 
1264         return ret;
1265 }
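      /* Usage sketch (not part of the driver): a umc error handler would
       * record the faulting page frame(s) and then reserve them so they
       * are never handed out again; bad_pfn is hypothetical:
       *
       *      unsigned long pfns[1] = { bad_pfn };
       *
       *      amdgpu_ras_add_bad_pages(adev, pfns, 1);
       *      amdgpu_ras_reserve_bad_pages(adev);
       */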
1266 
 1267 /* called in gpu recovery/init */
1268 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1269 {
1270         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1271         struct ras_err_handler_data *data;
1272         uint64_t bp;
1273         struct amdgpu_bo *bo = NULL;
1274         int i;
1275 
1276         if (!con || !con->eh_data)
1277                 return 0;
1278 
1279         mutex_lock(&con->recovery_lock);
1280         data = con->eh_data;
1281         if (!data)
1282                 goto out;
 1283         /* reserve vram at driver post stage. */
1284         for (i = data->last_reserved; i < data->count; i++) {
1285                 bp = data->bps[i].bp;
1286 
1287                 if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE,
1288                                                AMDGPU_GEM_DOMAIN_VRAM,
1289                                                &bo, NULL))
1290                         DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
1291 
1292                 data->bps[i].bo = bo;
1293                 data->last_reserved = i + 1;
1294                 bo = NULL;
1295         }
1296 out:
1297         mutex_unlock(&con->recovery_lock);
1298         return 0;
1299 }
1300 
 1301 /* called when driver unload */
1302 static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1303 {
1304         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1305         struct ras_err_handler_data *data;
1306         struct amdgpu_bo *bo;
1307         int i;
1308 
1309         if (!con || !con->eh_data)
1310                 return 0;
1311 
1312         mutex_lock(&con->recovery_lock);
1313         data = con->eh_data;
1314         if (!data)
1315                 goto out;
1316 
1317         for (i = data->last_reserved - 1; i >= 0; i--) {
1318                 bo = data->bps[i].bo;
1319 
1320                 amdgpu_bo_free_kernel(&bo, NULL, NULL);
1321 
1322                 data->bps[i].bo = bo;
1323                 data->last_reserved = i;
1324         }
1325 out:
1326         mutex_unlock(&con->recovery_lock);
1327         return 0;
1328 }
1329 
1330 static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1331 {
 1332         /* TODO
 1333          * save the bad page records so they survive a driver reload
 1334          */
1335         return 0;
1336 }
1337 
1338 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1339 {
 1340         /* TODO
 1341          * load the previously saved bad page records on init
 1342          */
1343         return 0;
1344 }
1345 
1346 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1347 {
1348         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1349         struct ras_err_handler_data **data = &con->eh_data;
1350 
1351         *data = kmalloc(sizeof(**data),
1352                         GFP_KERNEL|__GFP_ZERO);
1353         if (!*data)
1354                 return -ENOMEM;
1355 
1356         mutex_init(&con->recovery_lock);
1357         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1358         atomic_set(&con->in_recovery, 0);
1359         con->adev = adev;
1360 
1361         amdgpu_ras_load_bad_pages(adev);
1362         amdgpu_ras_reserve_bad_pages(adev);
1363 
1364         return 0;
1365 }
1366 
1367 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1368 {
1369         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1370         struct ras_err_handler_data *data = con->eh_data;
1371 
1372         cancel_work_sync(&con->recovery_work);
1373         amdgpu_ras_save_bad_pages(adev);
1374         amdgpu_ras_release_bad_pages(adev);
1375 
1376         mutex_lock(&con->recovery_lock);
1377         con->eh_data = NULL;
1378         kfree(data->bps);
1379         kfree(data);
1380         mutex_unlock(&con->recovery_lock);
1381 
1382         return 0;
1383 }
 1384 /* recovery end */
 1385 
 1386 /* return 0 if ras will reset gpu and repost. */
1387 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1388                 unsigned int block)
1389 {
1390         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1391 
1392         if (!ras)
1393                 return -EINVAL;
1394 
1395         ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1396         return 0;
1397 }
1398 
 1399 /*
 1400  * check hardware's ras ability which will be saved in hw_supported.
 1401  * If hardware does not support ras, we can skip some ras initialization
 1402  * and forbid some ras operations from IP.
 1403  * If software itself, say boot parameter, limits the ras ability, we
 1404  * still need to allow IP to do some limited operations, like disable.
 1405  * In such case, we have to initialize ras as normal, but need to check
 1406  * if the operation is allowed or not in each function.
 1407  */
1408 static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1409                 uint32_t *hw_supported, uint32_t *supported)
1410 {
1411         *hw_supported = 0;
1412         *supported = 0;
1413 
1414         if (amdgpu_sriov_vf(adev) ||
1415                         adev->asic_type != CHIP_VEGA20)
1416                 return;
1417 
1418         if (adev->is_atom_fw &&
1419                         (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
1420                          amdgpu_atomfirmware_sram_ecc_supported(adev)))
1421                 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
1422 
1423         *supported = amdgpu_ras_enable == 0 ?
1424                                 0 : *hw_supported & amdgpu_ras_mask;
1425 }
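      /* Worked example (assumed parameter values): with amdgpu_ras_enable=1
       * and amdgpu_ras_mask=0x1, *hw_supported may be AMDGPU_RAS_BLOCK_MASK
       * while *supported becomes 0x1, i.e. only block 0 (umc); and
       * amdgpu_ras_enable=0 forces *supported to 0 regardless of the mask.
       */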
1426 
1427 int amdgpu_ras_init(struct amdgpu_device *adev)
1428 {
1429         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1430 
1431         if (con)
1432                 return 0;
1433 
1434         con = kmalloc(sizeof(struct amdgpu_ras) +
1435                         sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1436                         GFP_KERNEL|__GFP_ZERO);
1437         if (!con)
1438                 return -ENOMEM;
1439 
1440         con->objs = (struct ras_manager *)(con + 1);
1441 
1442         amdgpu_ras_set_context(adev, con);
1443 
1444         amdgpu_ras_check_supported(adev, &con->hw_supported,
1445                         &con->supported);
1446         if (!con->hw_supported) {
1447                 amdgpu_ras_set_context(adev, NULL);
1448                 kfree(con);
1449                 return 0;
1450         }
1451 
1452         con->features = 0;
1453         INIT_LIST_HEAD(&con->head);
 1454         /* Might need get this flag from vbios. */
1455         con->flags = RAS_DEFAULT_FLAGS;
1456 
1457         if (amdgpu_ras_recovery_init(adev))
1458                 goto recovery_out;
1459 
1460         amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1461 
1462         if (amdgpu_ras_fs_init(adev))
1463                 goto fs_out;
1464 
 1465         /* ras init for the umc ip, if the ip provides the callback */
1466         if (adev->umc.funcs->ras_init)
1467                 adev->umc.funcs->ras_init(adev);
1468 
1469         DRM_INFO("RAS INFO: ras initialized successfully, "
1470                         "hardware ability[%x] ras_mask[%x]\n",
1471                         con->hw_supported, con->supported);
1472         return 0;
1473 fs_out:
1474         amdgpu_ras_recovery_fini(adev);
1475 recovery_out:
1476         amdgpu_ras_set_context(adev, NULL);
1477         kfree(con);
1478 
1479         return -EINVAL;
1480 }
1481 
1482 
 1483 /* do some init work after IP late init as dependence */
1484 
1485 void amdgpu_ras_resume(struct amdgpu_device *adev)
1486 {
1487         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1488         struct ras_manager *obj, *tmp;
1489 
1490         if (!con)
1491                 return;
1492 
1493         if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
 1494                 /* Set up ras objs for all blocks here, since vbios has
 1495                  * enabled ras for us. The error type is set to NONE so
 1496                  * this is only driver-side bookkeeping; blocks that the
 1497                  * driver does not support are disabled again just below.
 1498                  */
1499                 amdgpu_ras_enable_all_features(adev, 1);
1500 
 1501                 /* We enable ras on all hw_supported blocks, but the boot
 1502                  * parameter might disable some of them, and one or more
 1503                  * IPs may not be implemented yet: disable those on behalf.
 1504                  */
1505                 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1506                         if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1507                                 amdgpu_ras_feature_enable(adev, &obj->head, 0);
 1508                                 /* there should be no reference left */
1509                                 WARN_ON(alive_obj(obj));
1510                         }
1511                 }
1512         }
1513 
1514         if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1515                 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
 1516                 /* setup the ras obj state as disabled.
 1517                  * for the init_by_vbios case:
 1518                  * to enable ras, just enable it in the normal way;
 1519                  * to disable it, the obj must first be set up as
 1520                  * enabled, then another TA disable cmd is issued.
 1521                  * See amdgpu_ras_feature_enable_on_boot.
 1522                  */
1523                 amdgpu_ras_disable_all_features(adev, 1);
1524                 amdgpu_ras_reset_gpu(adev, 0);
1525         }
1526 }
1527 
1528 void amdgpu_ras_suspend(struct amdgpu_device *adev)
1529 {
1530         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1531 
1532         if (!con)
1533                 return;
1534 
1535         amdgpu_ras_disable_all_features(adev, 0);
 1536         /* Make sure all ras objects are disabled. */
1537         if (con->features)
1538                 amdgpu_ras_disable_all_features(adev, 1);
1539 }
1540 
 1541 /* do some fini work before IP fini as dependence */
1542 int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1543 {
1544         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1545 
1546         if (!con)
1547                 return 0;
1548 
 1549         /* Need disable ras on all IPs here before ip [hw/sw]fini */
1550         amdgpu_ras_disable_all_features(adev, 0);
1551         amdgpu_ras_recovery_fini(adev);
1552         return 0;
1553 }
1554 
1555 int amdgpu_ras_fini(struct amdgpu_device *adev)
1556 {
1557         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1558 
1559         if (!con)
1560                 return 0;
1561 
1562         amdgpu_ras_fs_fini(adev);
1563         amdgpu_ras_interrupt_remove_all(adev);
1564 
1565         WARN(con->features, "Feature mask is not cleared");
1566 
1567         if (con->features)
1568                 amdgpu_ras_disable_all_features(adev, 1);
1569 
1570         amdgpu_ras_set_context(adev, NULL);
1571         kfree(con);
1572 
1573         return 0;
1574 }