root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. amdgpu_ras_debugfs_read
  2. amdgpu_ras_find_block_id_by_name
  3. amdgpu_ras_debugfs_ctrl_parse_data
  4. amdgpu_ras_debugfs_ctrl_write
  5. amdgpu_ras_sysfs_read
  6. put_obj
  7. amdgpu_ras_create_obj
  8. amdgpu_ras_find_obj
  9. amdgpu_ras_is_feature_allowed
  10. amdgpu_ras_is_feature_enabled
  11. __amdgpu_ras_feature_enable
  12. amdgpu_ras_feature_enable
  13. amdgpu_ras_feature_enable_on_boot
  14. amdgpu_ras_disable_all_features
  15. amdgpu_ras_enable_all_features
  16. amdgpu_ras_error_query
  17. amdgpu_ras_error_inject
  18. amdgpu_ras_error_cure
  19. amdgpu_ras_query_error_count
  20. amdgpu_ras_badpage_flags_str
  21. amdgpu_ras_sysfs_badpages_read
  22. amdgpu_ras_sysfs_features_read
  23. amdgpu_ras_sysfs_create_feature_node
  24. amdgpu_ras_sysfs_remove_feature_node
  25. amdgpu_ras_sysfs_create
  26. amdgpu_ras_sysfs_remove
  27. amdgpu_ras_sysfs_remove_all
  28. amdgpu_ras_debugfs_create_ctrl_node
  29. amdgpu_ras_debugfs_create
  30. amdgpu_ras_debugfs_remove
  31. amdgpu_ras_debugfs_remove_all
  32. amdgpu_ras_fs_init
  33. amdgpu_ras_fs_fini
  34. amdgpu_ras_interrupt_handler
  35. amdgpu_ras_interrupt_process_handler
  36. amdgpu_ras_interrupt_dispatch
  37. amdgpu_ras_interrupt_remove_handler
  38. amdgpu_ras_interrupt_add_handler
  39. amdgpu_ras_interrupt_remove_all
  40. amdgpu_ras_badpages_read
  41. amdgpu_ras_do_recovery
  42. amdgpu_ras_realloc_eh_data_space
  43. amdgpu_ras_add_bad_pages
  44. amdgpu_ras_reserve_bad_pages
  45. amdgpu_ras_release_bad_pages
  46. amdgpu_ras_save_bad_pages
  47. amdgpu_ras_load_bad_pages
  48. amdgpu_ras_recovery_init
  49. amdgpu_ras_recovery_fini
  50. amdgpu_ras_request_reset_on_boot
  51. amdgpu_ras_check_supported
  52. amdgpu_ras_init
  53. amdgpu_ras_resume
  54. amdgpu_ras_suspend
  55. amdgpu_ras_pre_fini
  56. amdgpu_ras_fini

   1 /*
   2  * Copyright 2018 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  *
  23  */
  24 #include <linux/debugfs.h>
  25 #include <linux/list.h>
  26 #include <linux/module.h>
  27 #include <linux/uaccess.h>
  28 
  29 #include "amdgpu.h"
  30 #include "amdgpu_ras.h"
  31 #include "amdgpu_atomfirmware.h"
  32 
/* Human-readable names for RAS error types; looked up through the
 * ras_err_str() macro below, which maps an error-type bit mask to an
 * index with ffs().
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};
  40 
/* Human-readable names for RAS IP blocks, indexed by block id.
 * Order must match the amdgpu_ras_block enum (looked up via
 * ras_block_str() and searched by amdgpu_ras_find_block_id_by_name()).
 */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};
  57 
/* Map an error-type bit mask to its name: ffs() returns the 1-based
 * position of the lowest set bit, so mask 0 selects "none" and bit N
 * selects ras_error_string[N + 1]. */
#define ras_err_str(i) (ras_error_string[ffs(i)])
/* Map a block id (plain index, not a mask) to its name. */
#define ras_block_str(i) (ras_block_string[i])

/* RAS context flags: vbios did the initial enablement / a reset is
 * still required before RAS can be (re)enabled. */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS           1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET         2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
  67 
  68 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
  69                                         size_t size, loff_t *pos)
  70 {
  71         struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
  72         struct ras_query_if info = {
  73                 .head = obj->head,
  74         };
  75         ssize_t s;
  76         char val[128];
  77 
  78         if (amdgpu_ras_error_query(obj->adev, &info))
  79                 return -EINVAL;
  80 
  81         s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
  82                         "ue", info.ue_count,
  83                         "ce", info.ce_count);
  84         if (*pos >= s)
  85                 return 0;
  86 
  87         s -= *pos;
  88         s = min_t(u64, s, size);
  89 
  90 
  91         if (copy_to_user(buf, &val[*pos], s))
  92                 return -EINVAL;
  93 
  94         *pos += s;
  95 
  96         return s;
  97 }
  98 
/* debugfs node: read-only per-block error-count report */
static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};
 105 
 106 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
 107 {
 108         int i;
 109 
 110         for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
 111                 *block_id = i;
 112                 if (strcmp(name, ras_block_str(i)) == 0)
 113                         return 0;
 114         }
 115         return -EINVAL;
 116 }
 117 
/* Parse a command written to the RAS debugfs control node.
 *
 * Two input formats are accepted:
 *  - an ASCII command line: "disable <block>", "enable <block> <err>" or
 *    "inject <block> <err> <sub_block> <address> <value>";
 *  - a raw struct ras_debug_if, copied in verbatim (program interface).
 *
 * On success @data is filled in and 0 is returned; -EINVAL otherwise.
 */
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	/* the whole command must arrive in a single write */
	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			/* inject takes three extra operands: try decimal
			 * first, then retry with a 0x-prefixed hex form */
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		/* not a recognizable ASCII command: interpret the write as
		 * a raw struct ras_debug_if from a program */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
 186 /**
 187  * DOC: AMDGPU RAS debugfs control interface
 188  *
 189  * It accepts struct ras_debug_if who has two members.
 190  *
 191  * First member: ras_debug_if::head or ras_debug_if::inject.
 192  *
 193  * head is used to indicate which IP block will be under control.
 194  *
 195  * head has four members, they are block, type, sub_block_index, name.
 196  * block: which IP will be under control.
 197  * type: what kind of error will be enabled/disabled/injected.
  198  * sub_block_index: some IPs have subcomponents. say, GFX, sDMA.
 199  * name: the name of IP.
 200  *
 201  * inject has two more members than head, they are address, value.
 202  * As their names indicate, inject operation will write the
 203  * value to the address.
 204  *
 205  * Second member: struct ras_debug_if::op.
 206  * It has three kinds of operations.
 207  *  0: disable RAS on the block. Take ::head as its data.
 208  *  1: enable RAS on the block. Take ::head as its data.
 209  *  2: inject errors on the block. Take ::inject as its data.
 210  *
 211  * How to use the interface?
 212  * programs:
 213  * copy the struct ras_debug_if in your codes and initialize it.
 214  * write the struct to the control node.
 215  *
 216  * bash:
  217  * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 218  *      op: disable, enable, inject
 219  *              disable: only block is needed
 220  *              enable: block and error are needed
 221  *              inject: error, address, value are needed
  222  *      block: umc, sdma, gfx, .........
 223  *              see ras_block_string[] for details
 224  *      error: ue, ce
 225  *              ue: multi_uncorrectable
 226  *              ce: single_correctable
 227  *      sub_block: sub block index, pass 0 if there is no sub block
 228  *
 229  * here are some examples for bash commands,
 230  *      echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 231  *      echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 232  *      echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 233  *
 234  * How to check the result?
 235  *
 236  * For disable/enable, please check ras features at
 237  * /sys/class/drm/card[0/1/2...]/device/ras/features
 238  *
 239  * For inject, please check corresponding err count at
 240  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 241  *
 242  * NOTE: operation is only allowed on blocks which are supported.
 243  * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
 244  */
 245 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
 246                 size_t size, loff_t *pos)
 247 {
 248         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
 249         struct ras_debug_if data;
 250         int ret = 0;
 251 
 252         ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
 253         if (ret)
 254                 return -EINVAL;
 255 
 256         if (!amdgpu_ras_is_supported(adev, data.head.block))
 257                 return -EINVAL;
 258 
 259         switch (data.op) {
 260         case 0:
 261                 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
 262                 break;
 263         case 1:
 264                 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
 265                 break;
 266         case 2:
 267                 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
 268                     (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
 269                         ret = -EINVAL;
 270                         break;
 271                 }
 272 
 273                 /* data.inject.address is offset instead of absolute gpu address */
 274                 ret = amdgpu_ras_error_inject(adev, &data.inject);
 275                 break;
 276         default:
 277                 ret = -EINVAL;
 278                 break;
 279         };
 280 
 281         if (ret)
 282                 return -EINVAL;
 283 
 284         return size;
 285 }
 286 
/* debugfs node: write-only RAS control interface */
static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};
 293 
 294 static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 295                 struct device_attribute *attr, char *buf)
 296 {
 297         struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
 298         struct ras_query_if info = {
 299                 .head = obj->head,
 300         };
 301 
 302         if (amdgpu_ras_error_query(obj->adev, &info))
 303                 return -EINVAL;
 304 
 305         return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
 306                         "ue", info.ue_count,
 307                         "ce", info.ce_count);
 308 }
 309 
/* obj begin */

/* Reference counting for ras_manager objects: get_obj() takes a
 * reference, alive_obj() tests whether any reference is held. */
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

/* Drop a reference; unlink the object from the context list when the
 * last reference goes away. A negative count means puts outnumbered
 * gets — log it loudly. */
static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
	}
}
 323 
 324 /* make one obj and return it. */
 325 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 326                 struct ras_common_if *head)
 327 {
 328         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 329         struct ras_manager *obj;
 330 
 331         if (!con)
 332                 return NULL;
 333 
 334         if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 335                 return NULL;
 336 
 337         obj = &con->objs[head->block];
 338         /* already exist. return obj? */
 339         if (alive_obj(obj))
 340                 return NULL;
 341 
 342         obj->head = *head;
 343         obj->adev = adev;
 344         list_add(&obj->node, &con->head);
 345         get_obj(obj);
 346 
 347         return obj;
 348 }
 349 
 350 /* return an obj equal to head, or the first when head is NULL */
 351 static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 352                 struct ras_common_if *head)
 353 {
 354         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 355         struct ras_manager *obj;
 356         int i;
 357 
 358         if (!con)
 359                 return NULL;
 360 
 361         if (head) {
 362                 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 363                         return NULL;
 364 
 365                 obj = &con->objs[head->block];
 366 
 367                 if (alive_obj(obj)) {
 368                         WARN_ON(head->block != obj->head.block);
 369                         return obj;
 370                 }
 371         } else {
 372                 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
 373                         obj = &con->objs[i];
 374                         if (alive_obj(obj)) {
 375                                 WARN_ON(i != obj->head.block);
 376                                 return obj;
 377                         }
 378                 }
 379         }
 380 
 381         return NULL;
 382 }
 383 /* obj end */
 384 
 385 /* feature ctl begin */
 386 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
 387                 struct ras_common_if *head)
 388 {
 389         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 390 
 391         return con->hw_supported & BIT(head->block);
 392 }
 393 
 394 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
 395                 struct ras_common_if *head)
 396 {
 397         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 398 
 399         return con->features & BIT(head->block);
 400 }
 401 
/*
 * Driver-side half of feature enablement: create/destroy the object
 * and flip the feature bit, without touching the psp/TA.
 * if obj is not created, then create one.
 * set feature enable flag.
 * Returns 0 on success or no-op, -EINVAL when object creation fails.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If hardware does not support ras, then do not create obj.
	 * But if hardware support ras, we can create the obj.
	 * Ras framework checks con->hw_supported to see if it need do
	 * corresponding initialization.
	 * IP checks con->support to see if it need disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	/* nothing to do when the feature is already in the requested state */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}
 442 
/* wrapper of psp_ras_enable_features
 *
 * Issue the enable/disable command to the RAS TA via psp, then update
 * the driver-side state through __amdgpu_ras_feature_enable().
 * Returns 0 on success or no-op, -EAGAIN when the TA requests a gpu
 * reset first, -EINVAL on other failures.
 */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	/* build the TA command for the requested direction */
	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id =  amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		/* RESET_NEEDED: the TA wants a gpu reset before retrying */
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
 488 
/* Only used in device probe stage and called only once.
 *
 * When vbios already enabled RAS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS), the
 * driver objects must be set up to mirror the hardware state before a
 * disable can be issued; otherwise this is a plain pass-through to
 * amdgpu_ras_feature_enable().
 */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm to issue a ras TA cmd regardless of
			 * the current ras state.
			 * If current state == target state, it will do nothing
			 * But sometimes it requests driver to reset and repost
			 * with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With old ras TA, we might fail to enable ras.
			 * Log it and just setup the object.
			 * TODO need remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd.*/
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}
 531 
 532 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
 533                 bool bypass)
 534 {
 535         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 536         struct ras_manager *obj, *tmp;
 537 
 538         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 539                 /* bypass psp.
 540                  * aka just release the obj and corresponding flags
 541                  */
 542                 if (bypass) {
 543                         if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
 544                                 break;
 545                 } else {
 546                         if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
 547                                 break;
 548                 }
 549         }
 550 
 551         return con->features;
 552 }
 553 
/* Try to enable RAS on every block. With @bypass set, vbios already
 * enabled the hardware side and only the driver objects are created.
 * Stops at the first failure and returns the resulting feature mask. */
static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		/* NOTE(review): assumes head.name can hold every entry of
		 * ras_block_string — verify against struct ras_common_if */
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enable ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
 585 /* feature ctl end */
 586 
/* query/inject/cure begin */

/* Query the ue/ce error counters for the block named in @info->head.
 *
 * Per-IP callbacks fill a local ras_err_data; the fresh counts are
 * accumulated into the block's ras_manager and the running totals are
 * reported back through @info. Returns -EINVAL when no live object
 * exists for the block.
 */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub_funcs->query_ras_error_count)
			adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		/* other blocks have no count callback yet */
		break;
	}

	/* counters accumulate over the object's lifetime */
	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count)
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	if (err_data.ue_count)
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));

	return 0;
}
 634 
/* wrapper of psp_ras_trigger_error
 *
 * Inject a RAS error on the block described by @info. GFX injection is
 * routed through the IP-specific hook; UMC/MMHUB go through the psp
 * TA. Returns -EINVAL when the block has no live object or injection
 * is unsupported, otherwise the underlying call's result.
 */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id =  amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		/* gfx injection uses the IP hook, not the TA command */
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}
 676 
/* Placeholder: always reports success without doing anything. */
int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}
 683 
 684 /* get the total error counts on all IPs */
 685 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 686                 bool is_ce)
 687 {
 688         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 689         struct ras_manager *obj;
 690         struct ras_err_data data = {0, 0};
 691 
 692         if (!con)
 693                 return 0;
 694 
 695         list_for_each_entry(obj, &con->head, node) {
 696                 struct ras_query_if info = {
 697                         .head = obj->head,
 698                 };
 699 
 700                 if (amdgpu_ras_error_query(adev, &info))
 701                         return 0;
 702 
 703                 data.ce_count += info.ce_count;
 704                 data.ue_count += info.ue_count;
 705         }
 706 
 707         return is_ce ? data.ce_count : data.ue_count;
 708 }
 709 /* query/inject/cure end */
 710 
 711 
 712 /* sysfs begin */
 713 
 714 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 715                 struct ras_badpage **bps, unsigned int *count);
 716 
 717 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 718 {
 719         switch (flags) {
 720         case 0:
 721                 return "R";
 722         case 1:
 723                 return "P";
 724         case 2:
 725         default:
 726                 return "F";
 727         };
 728 }
 729 
 730 /*
 731  * DOC: ras sysfs gpu_vram_bad_pages interface
 732  *
 733  * It allows user to read the bad pages of vram on the gpu through
 734  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 735  *
 736  * It outputs multiple lines, and each line stands for one gpu page.
 737  *
 738  * The format of one line is below,
 739  * gpu pfn : gpu page size : flags
 740  *
 741  * gpu pfn and gpu page size are printed in hex format.
 742  * flags can be one of below character,
 743  * R: reserved, this gpu page is reserved and not able to use.
 744  * P: pending for reserve, this gpu page is marked as bad, will be reserved
 745  *    in next window of page_reserve.
  746  * F: unable to reserve. this gpu page could not be reserved for some reason.
 747  *
 748  * examples:
 749  * 0x00000001 : 0x00001000 : R
 750  * 0x00000002 : 0x00001000 : P
 751  */
 752 
/* sysfs binary-attribute read handler for gpu_vram_bad_pages: emit one
 * fixed-width line per bad page (see the DOC comment above for the
 * format). Returns the number of bytes written into @buf.
 */
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	/* every line has the same width, so the file offset converts
	 * directly into a record index range [start, end) */
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	/* the bps array was allocated by amdgpu_ras_badpages_read() */
	kfree(bps);

	return s;
}
 784 
 785 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 786                 struct device_attribute *attr, char *buf)
 787 {
 788         struct amdgpu_ras *con =
 789                 container_of(attr, struct amdgpu_ras, features_attr);
 790 
 791         return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 792 }
 793 
/*
 * Create the device's "ras" sysfs group with its two always-present
 * entries: the "features" text attribute and the "gpu_vram_bad_pages"
 * binary attribute.  Per-block error nodes are added to the same group
 * later by amdgpu_ras_sysfs_create().
 *
 * Returns the result of sysfs_create_group() (0 on success).
 */
static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	/* the attributes live in the ras context; initialize them in place
	 * before handing the group to sysfs
	 */
	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}
 834 
 835 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 836 {
 837         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 838         struct attribute *attrs[] = {
 839                 &con->features_attr.attr,
 840                 NULL
 841         };
 842         struct bin_attribute *bin_attrs[] = {
 843                 &con->badpages_attr,
 844                 NULL
 845         };
 846         struct attribute_group group = {
 847                 .name = "ras",
 848                 .attrs = attrs,
 849                 .bin_attrs = bin_attrs,
 850         };
 851 
 852         sysfs_remove_group(&adev->dev->kobj, &group);
 853 
 854         return 0;
 855 }
 856 
/*
 * Add a per-block error count file (named head->sysfs_name) to the
 * device's "ras" sysfs group, shown via amdgpu_ras_sysfs_read().
 * Takes a reference on the ras object for the lifetime of the file.
 *
 * Returns 0 on success, -EINVAL if the block has no ras object, the
 * attribute is already in use, or adding the file to the group fails.
 */
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	/* keep a private copy of the name: sysfs holds the pointer for as
	 * long as the file exists
	 */
	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
			.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}
 891 
 892 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 893                 struct ras_common_if *head)
 894 {
 895         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 896 
 897         if (!obj || !obj->attr_inuse)
 898                 return -EINVAL;
 899 
 900         sysfs_remove_file_from_group(&adev->dev->kobj,
 901                                 &obj->sysfs_attr.attr,
 902                                 "ras");
 903         obj->attr_inuse = 0;
 904         put_obj(obj);
 905 
 906         return 0;
 907 }
 908 
 909 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 910 {
 911         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 912         struct ras_manager *obj, *tmp;
 913 
 914         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 915                 amdgpu_ras_sysfs_remove(adev, &obj->head);
 916         }
 917 
 918         amdgpu_ras_sysfs_remove_feature_node(adev);
 919 
 920         return 0;
 921 }
 922 /* sysfs end */
 923 
 924 /* debugfs begin */
 925 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 926 {
 927         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 928         struct drm_minor *minor = adev->ddev->primary;
 929 
 930         con->dir = debugfs_create_dir("ras", minor->debugfs_root);
 931         con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
 932                                        adev, &amdgpu_ras_debugfs_ctrl_ops);
 933 }
 934 
/*
 * Create the per-block debugfs entry (named head->debugfs_name) under the
 * "ras" directory, backed by amdgpu_ras_debugfs_ops.  Takes a reference
 * on the ras object for the lifetime of the entry.
 */
void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	/* nothing to do if the block is unknown or already has an entry */
	if (!obj || obj->ent)
		return;

	get_obj(obj);

	/* debugfs keeps the name pointer, so use our own stable copy */
	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}
 954 
 955 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 956                 struct ras_common_if *head)
 957 {
 958         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 959 
 960         if (!obj || !obj->ent)
 961                 return;
 962 
 963         debugfs_remove(obj->ent);
 964         obj->ent = NULL;
 965         put_obj(obj);
 966 }
 967 
 968 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 969 {
 970         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 971         struct ras_manager *obj, *tmp;
 972 
 973         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 974                 amdgpu_ras_debugfs_remove(adev, &obj->head);
 975         }
 976 
 977         debugfs_remove(con->ent);
 978         debugfs_remove(con->dir);
 979         con->dir = NULL;
 980         con->ent = NULL;
 981 }
 982 /* debugfs end */
 983 
 984 /* ras fs */
 985 
 986 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 987 {
 988         amdgpu_ras_sysfs_create_feature_node(adev);
 989         amdgpu_ras_debugfs_create_ctrl_node(adev);
 990 
 991         return 0;
 992 }
 993 
 994 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 995 {
 996         amdgpu_ras_debugfs_remove_all(adev);
 997         amdgpu_ras_sysfs_remove_all(adev);
 998         return 0;
 999 }
1000 /* ras fs end */
1001 
1002 /* ih begin */
/*
 * Drain all pending IV entries from the object's ring buffer and hand
 * each one to the block's registered callback, accumulating the reported
 * ue/ce error counts on the object.
 */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		/* ensure the entry written by the dispatcher is visible
		 * before copying it out
		 */
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		/* publish the new read pointer only after the copy */
		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let IP handle its data, maybe we need get the output
		 * from the callback to update the error type/count, etc
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need do a reset to recovery the whole system.
			 * But leave IP do that recovery, here we just dispatch
			 * the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error number
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}
1039 
1040 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1041 {
1042         struct ras_ih_data *data =
1043                 container_of(work, struct ras_ih_data, ih_work);
1044         struct ras_manager *obj =
1045                 container_of(data, struct ras_manager, ih_data);
1046 
1047         amdgpu_ras_interrupt_handler(obj);
1048 }
1049 
1050 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1051                 struct ras_dispatch_if *info)
1052 {
1053         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1054         struct ras_ih_data *data = &obj->ih_data;
1055 
1056         if (!obj)
1057                 return -EINVAL;
1058 
1059         if (data->inuse == 0)
1060                 return 0;
1061 
1062         /* Might be overflow... */
1063         memcpy(&data->ring[data->wptr], info->entry,
1064                         data->element_size);
1065 
1066         wmb();
1067         data->wptr = (data->aligned_element_size +
1068                         data->wptr) % data->ring_size;
1069 
1070         schedule_work(&data->ih_work);
1071 
1072         return 0;
1073 }
1074 
1075 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1076                 struct ras_ih_if *info)
1077 {
1078         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1079         struct ras_ih_data *data;
1080 
1081         if (!obj)
1082                 return -EINVAL;
1083 
1084         data = &obj->ih_data;
1085         if (data->inuse == 0)
1086                 return 0;
1087 
1088         cancel_work_sync(&data->ih_work);
1089 
1090         kfree(data->ring);
1091         memset(data, 0, sizeof(*data));
1092         put_obj(obj);
1093 
1094         return 0;
1095 }
1096 
/*
 * Register an interrupt handler for a block: install the callback and a
 * 64-entry IV ring that amdgpu_ras_interrupt_dispatch() writes into and
 * the work handler drains.
 *
 * Returns 0 on success, -EINVAL when no ras object can be created and
 * -ENOMEM when the ring allocation fails.
 */
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enable ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback.etc */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
1137 
1138 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1139 {
1140         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1141         struct ras_manager *obj, *tmp;
1142 
1143         list_for_each_entry_safe(obj, tmp, &con->head, node) {
1144                 struct ras_ih_if info = {
1145                         .head = obj->head,
1146                 };
1147                 amdgpu_ras_interrupt_remove_handler(adev, &info);
1148         }
1149 
1150         return 0;
1151 }
1152 /* ih end */
1153 
1154 /* recovery begin */
1155 
1156 /* return 0 on success.
1157  * caller need free bps.
1158  */
1159 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1160                 struct ras_badpage **bps, unsigned int *count)
1161 {
1162         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1163         struct ras_err_handler_data *data;
1164         int i = 0;
1165         int ret = 0;
1166 
1167         if (!con || !con->eh_data || !bps || !count)
1168                 return -EINVAL;
1169 
1170         mutex_lock(&con->recovery_lock);
1171         data = con->eh_data;
1172         if (!data || data->count == 0) {
1173                 *bps = NULL;
1174                 goto out;
1175         }
1176 
1177         *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1178         if (!*bps) {
1179                 ret = -ENOMEM;
1180                 goto out;
1181         }
1182 
1183         for (; i < data->count; i++) {
1184                 (*bps)[i] = (struct ras_badpage){
1185                         .bp = data->bps[i].bp,
1186                         .size = AMDGPU_GPU_PAGE_SIZE,
1187                         .flags = 0,
1188                 };
1189 
1190                 if (data->last_reserved <= i)
1191                         (*bps)[i].flags = 1;
1192                 else if (data->bps[i].bo == NULL)
1193                         (*bps)[i].flags = 2;
1194         }
1195 
1196         *count = data->count;
1197 out:
1198         mutex_unlock(&con->recovery_lock);
1199         return ret;
1200 }
1201 
1202 static void amdgpu_ras_do_recovery(struct work_struct *work)
1203 {
1204         struct amdgpu_ras *ras =
1205                 container_of(work, struct amdgpu_ras, recovery_work);
1206 
1207         amdgpu_device_gpu_recover(ras->adev, 0);
1208         atomic_set(&ras->in_recovery, 0);
1209 }
1210 
1211 /* alloc/realloc bps array */
1212 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1213                 struct ras_err_handler_data *data, int pages)
1214 {
1215         unsigned int old_space = data->count + data->space_left;
1216         unsigned int new_space = old_space + pages;
1217         unsigned int align_space = ALIGN(new_space, 1024);
1218         void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1219 
1220         if (!tmp)
1221                 return -ENOMEM;
1222 
1223         if (data->bps) {
1224                 memcpy(tmp, data->bps,
1225                                 data->count * sizeof(*data->bps));
1226                 kfree(data->bps);
1227         }
1228 
1229         data->bps = tmp;
1230         data->space_left += align_space - old_space;
1231         return 0;
1232 }
1233 
/* Record newly discovered bad vram pages (page frame numbers in @bps);
 * they are pinned later by amdgpu_ras_reserve_bad_pages().
 * It deals with vram only.
 */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	/* grow the array first if the new pages would not fit */
	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	/* append the new page frame numbers (in reverse input order) */
	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
1266 
/* called in gpu recovery/init
 *
 * Walks the not-yet-reserved tail of the bad page list and pins each page
 * with a kernel bo so nothing can ever be allocated there again.
 */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		/* a failed reservation is logged but not fatal: bo stays
		 * NULL and the page is reported with flag "F" in sysfs
		 */
		if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}
1300 
/* called when driver unload
 *
 * Frees the reservation bos in reverse order, keeping last_reserved
 * consistent so a partial release leaves valid state.
 */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		/* amdgpu_bo_free_kernel NULLs the pointer handed to it */
		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}
1329 
/* Persist the bad page list; currently a stub that always succeeds. */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the bad page array to eeprom when SMU is disabled.
	 */
	return 0;
}
1337 
/* Restore the bad page list; currently a stub that always succeeds. */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the bad page array back from eeprom when SMU is disabled.
	 */
	return 0;
}
1345 
1346 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1347 {
1348         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1349         struct ras_err_handler_data **data = &con->eh_data;
1350 
1351         *data = kmalloc(sizeof(**data),
1352                         GFP_KERNEL|__GFP_ZERO);
1353         if (!*data)
1354                 return -ENOMEM;
1355 
1356         mutex_init(&con->recovery_lock);
1357         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1358         atomic_set(&con->in_recovery, 0);
1359         con->adev = adev;
1360 
1361         amdgpu_ras_load_bad_pages(adev);
1362         amdgpu_ras_reserve_bad_pages(adev);
1363 
1364         return 0;
1365 }
1366 
/*
 * Tear down the recovery machinery: flush pending recovery work, persist
 * and release the bad page list, then free the handler data.
 */
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	/* clear eh_data under the lock so concurrent users observe NULL
	 * before the backing memory is freed
	 */
	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
1384 /* recovery end */
1385 
1386 /* return 0 if ras will reset gpu and repost.*/
1387 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1388                 unsigned int block)
1389 {
1390         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1391 
1392         if (!ras)
1393                 return -EINVAL;
1394 
1395         ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1396         return 0;
1397 }
1398 
1399 /*
1400  * check hardware's ras ability which will be saved in hw_supported.
1401  * if hardware does not support ras, we can skip some ras initializtion and
1402  * forbid some ras operations from IP.
1403  * if software itself, say boot parameter, limit the ras ability. We still
1404  * need allow IP do some limited operations, like disable. In such case,
1405  * we have to initialize ras as normal. but need check if operation is
1406  * allowed or not in each function.
1407  */
1408 static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1409                 uint32_t *hw_supported, uint32_t *supported)
1410 {
1411         *hw_supported = 0;
1412         *supported = 0;
1413 
1414         if (amdgpu_sriov_vf(adev) ||
1415                         adev->asic_type != CHIP_VEGA20)
1416                 return;
1417 
1418         if (adev->is_atom_fw &&
1419                         (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
1420                          amdgpu_atomfirmware_sram_ecc_supported(adev)))
1421                 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
1422 
1423         *supported = amdgpu_ras_enable == 0 ?
1424                                 0 : *hw_supported & amdgpu_ras_mask;
1425 }
1426 
1427 int amdgpu_ras_init(struct amdgpu_device *adev)
1428 {
1429         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1430 
1431         if (con)
1432                 return 0;
1433 
1434         con = kmalloc(sizeof(struct amdgpu_ras) +
1435                         sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1436                         GFP_KERNEL|__GFP_ZERO);
1437         if (!con)
1438                 return -ENOMEM;
1439 
1440         con->objs = (struct ras_manager *)(con + 1);
1441 
1442         amdgpu_ras_set_context(adev, con);
1443 
1444         amdgpu_ras_check_supported(adev, &con->hw_supported,
1445                         &con->supported);
1446         if (!con->hw_supported) {
1447                 amdgpu_ras_set_context(adev, NULL);
1448                 kfree(con);
1449                 return 0;
1450         }
1451 
1452         con->features = 0;
1453         INIT_LIST_HEAD(&con->head);
1454         /* Might need get this flag from vbios. */
1455         con->flags = RAS_DEFAULT_FLAGS;
1456 
1457         if (amdgpu_ras_recovery_init(adev))
1458                 goto recovery_out;
1459 
1460         amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1461 
1462         if (amdgpu_ras_fs_init(adev))
1463                 goto fs_out;
1464 
1465         /* ras init for each ras block */
1466         if (adev->umc.funcs->ras_init)
1467                 adev->umc.funcs->ras_init(adev);
1468 
1469         DRM_INFO("RAS INFO: ras initialized successfully, "
1470                         "hardware ability[%x] ras_mask[%x]\n",
1471                         con->hw_supported, con->supported);
1472         return 0;
1473 fs_out:
1474         amdgpu_ras_recovery_fini(adev);
1475 recovery_out:
1476         amdgpu_ras_set_context(adev, NULL);
1477         kfree(con);
1478 
1479         return -EINVAL;
1480 }
1481 
/* do some init work after IP late init as dependence.
 * and it runs in resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as driver does not handle it, so
		 * ERROR_NONE make sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported block, but as boot
		 * parameter might disable some of them and one or more IP has
		 * not implemented yet. So we disable them on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no any reference. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		/* clear the flag first so the reset path is taken only once */
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* setup ras obj state as disabled.
		 * for init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * If we want do disable it, need setup ras obj as enabled,
		 * then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}
1527 
1528 void amdgpu_ras_suspend(struct amdgpu_device *adev)
1529 {
1530         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1531 
1532         if (!con)
1533                 return;
1534 
1535         amdgpu_ras_disable_all_features(adev, 0);
1536         /* Make sure all ras objects are disabled. */
1537         if (con->features)
1538                 amdgpu_ras_disable_all_features(adev, 1);
1539 }
1540 
/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);

	return 0;
}
1554 
1555 int amdgpu_ras_fini(struct amdgpu_device *adev)
1556 {
1557         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1558 
1559         if (!con)
1560                 return 0;
1561 
1562         amdgpu_ras_fs_fini(adev);
1563         amdgpu_ras_interrupt_remove_all(adev);
1564 
1565         WARN(con->features, "Feature mask is not cleared");
1566 
1567         if (con->features)
1568                 amdgpu_ras_disable_all_features(adev, 1);
1569 
1570         amdgpu_ras_set_context(adev, NULL);
1571         kfree(con);
1572 
1573         return 0;
1574 }

/* [<][>][^][v][top][bottom][index][help] */