root/drivers/gpu/drm/i915/gt/selftest_hangcheck.c

DEFINITIONS

This source file includes the following definitions:
  1. hang_init
  2. hws_address
  3. move_to_active
  4. hang_create_request
  5. hws_seqno
  6. hang_fini
  7. wait_until_running
  8. igt_hang_sanitycheck
  9. wait_for_idle
  10. igt_reset_nop
  11. igt_reset_nop_engine
  12. __igt_reset_engine
  13. igt_reset_idle_engine
  14. igt_reset_active_engine
  15. active_request_put
  16. active_engine
  17. __igt_reset_engines
  18. igt_reset_engines
  19. fake_hangcheck
  20. igt_reset_wait
  21. evict_vma
  22. evict_fence
  23. __igt_reset_evict_vma
  24. igt_reset_evict_ggtt
  25. igt_reset_evict_ppgtt
  26. igt_reset_evict_fence
  27. wait_for_others
  28. igt_reset_queue
  29. igt_handle_error
  30. __igt_atomic_reset_engine
  31. igt_atomic_reset_engine
  32. igt_reset_engines_atomic
  33. intel_hangcheck_live_selftests

   1 /*
   2  * Copyright © 2016 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  */
  24 
  25 #include <linux/kthread.h>
  26 
  27 #include "gem/i915_gem_context.h"
  28 #include "gt/intel_gt.h"
  29 #include "intel_engine_pm.h"
  30 
  31 #include "i915_selftest.h"
  32 #include "selftests/i915_random.h"
  33 #include "selftests/igt_flush_test.h"
  34 #include "selftests/igt_reset.h"
  35 #include "selftests/igt_atomic.h"
  36 
  37 #include "selftests/mock_drm.h"
  38 
  39 #include "gem/selftests/mock_context.h"
  40 #include "gem/selftests/igt_gem_utils.h"
  41 
  42 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
  43 
  44 struct hang {
  45         struct intel_gt *gt;
  46         struct drm_i915_gem_object *hws;
  47         struct drm_i915_gem_object *obj;
  48         struct i915_gem_context *ctx;
  49         u32 *seqno;
  50         u32 *batch;
  51 };
  52 
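      /*
       * Allocate the non-bannable context, HWS page and batch object used
       * to construct the hanging requests exercised by the tests below.
       */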
  53 static int hang_init(struct hang *h, struct intel_gt *gt)
  54 {
  55         void *vaddr;
  56         int err;
  57 
  58         memset(h, 0, sizeof(*h));
  59         h->gt = gt;
  60 
  61         h->ctx = kernel_context(gt->i915);
  62         if (IS_ERR(h->ctx))
  63                 return PTR_ERR(h->ctx);
  64 
  65         GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
  66 
  67         h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
  68         if (IS_ERR(h->hws)) {
  69                 err = PTR_ERR(h->hws);
  70                 goto err_ctx;
  71         }
  72 
  73         h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
  74         if (IS_ERR(h->obj)) {
  75                 err = PTR_ERR(h->obj);
  76                 goto err_hws;
  77         }
  78 
  79         i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
  80         vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
  81         if (IS_ERR(vaddr)) {
  82                 err = PTR_ERR(vaddr);
  83                 goto err_obj;
  84         }
  85         h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
  86 
  87         vaddr = i915_gem_object_pin_map(h->obj,
  88                                         i915_coherent_map_type(gt->i915));
  89         if (IS_ERR(vaddr)) {
  90                 err = PTR_ERR(vaddr);
  91                 goto err_unpin_hws;
  92         }
  93         h->batch = vaddr;
  94 
  95         return 0;
  96 
  97 err_unpin_hws:
  98         i915_gem_object_unpin_map(h->hws);
  99 err_obj:
 100         i915_gem_object_put(h->obj);
 101 err_hws:
 102         i915_gem_object_put(h->hws);
 103 err_ctx:
 104         kernel_context_close(h->ctx);
 105         return err;
 106 }
 107 
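      /* Address of the per-context seqno slot inside the HWS page */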
 108 static u64 hws_address(const struct i915_vma *hws,
 109                        const struct i915_request *rq)
 110 {
 111         return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
 112 }
 113 
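      /* Serialise the request against the vma and track the vma as active */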
 114 static int move_to_active(struct i915_vma *vma,
 115                           struct i915_request *rq,
 116                           unsigned int flags)
 117 {
 118         int err;
 119 
 120         i915_vma_lock(vma);
 121         err = i915_request_await_object(rq, vma->obj,
 122                                         flags & EXEC_OBJECT_WRITE);
 123         if (err == 0)
 124                 err = i915_vma_move_to_active(vma, rq, flags);
 125         i915_vma_unlock(vma);
 126 
 127         return err;
 128 }
 129 
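      /*
       * Build a request whose batch reports its seqno to the HWS page and
       * then branches back on itself, spinning until the batch is rewritten
       * or the engine is reset.
       */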
 130 static struct i915_request *
 131 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 132 {
 133         struct intel_gt *gt = h->gt;
 134         struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
 135         struct drm_i915_gem_object *obj;
 136         struct i915_request *rq = NULL;
 137         struct i915_vma *hws, *vma;
 138         unsigned int flags;
 139         void *vaddr;
 140         u32 *batch;
 141         int err;
 142 
 143         obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
 144         if (IS_ERR(obj))
 145                 return ERR_CAST(obj);
 146 
 147         vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
 148         if (IS_ERR(vaddr)) {
 149                 i915_gem_object_put(obj);
 150                 return ERR_CAST(vaddr);
 151         }
 152 
 153         i915_gem_object_unpin_map(h->obj);
 154         i915_gem_object_put(h->obj);
 155 
 156         h->obj = obj;
 157         h->batch = vaddr;
 158 
 159         vma = i915_vma_instance(h->obj, vm, NULL);
 160         if (IS_ERR(vma))
 161                 return ERR_CAST(vma);
 162 
 163         hws = i915_vma_instance(h->hws, vm, NULL);
 164         if (IS_ERR(hws))
 165                 return ERR_CAST(hws);
 166 
 167         err = i915_vma_pin(vma, 0, 0, PIN_USER);
 168         if (err)
 169                 return ERR_PTR(err);
 170 
 171         err = i915_vma_pin(hws, 0, 0, PIN_USER);
 172         if (err)
 173                 goto unpin_vma;
 174 
 175         rq = igt_request_alloc(h->ctx, engine);
 176         if (IS_ERR(rq)) {
 177                 err = PTR_ERR(rq);
 178                 goto unpin_hws;
 179         }
 180 
 181         err = move_to_active(vma, rq, 0);
 182         if (err)
 183                 goto cancel_rq;
 184 
 185         err = move_to_active(hws, rq, 0);
 186         if (err)
 187                 goto cancel_rq;
 188 
 189         batch = h->batch;
 190         if (INTEL_GEN(gt->i915) >= 8) {
 191                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
 192                 *batch++ = lower_32_bits(hws_address(hws, rq));
 193                 *batch++ = upper_32_bits(hws_address(hws, rq));
 194                 *batch++ = rq->fence.seqno;
 195                 *batch++ = MI_ARB_CHECK;
 196 
 197                 memset(batch, 0, 1024);
 198                 batch += 1024 / sizeof(*batch);
 199 
 200                 *batch++ = MI_ARB_CHECK;
 201                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 202                 *batch++ = lower_32_bits(vma->node.start);
 203                 *batch++ = upper_32_bits(vma->node.start);
 204         } else if (INTEL_GEN(gt->i915) >= 6) {
 205                 *batch++ = MI_STORE_DWORD_IMM_GEN4;
 206                 *batch++ = 0;
 207                 *batch++ = lower_32_bits(hws_address(hws, rq));
 208                 *batch++ = rq->fence.seqno;
 209                 *batch++ = MI_ARB_CHECK;
 210 
 211                 memset(batch, 0, 1024);
 212                 batch += 1024 / sizeof(*batch);
 213 
 214                 *batch++ = MI_ARB_CHECK;
 215                 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
 216                 *batch++ = lower_32_bits(vma->node.start);
 217         } else if (INTEL_GEN(gt->i915) >= 4) {
 218                 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 219                 *batch++ = 0;
 220                 *batch++ = lower_32_bits(hws_address(hws, rq));
 221                 *batch++ = rq->fence.seqno;
 222                 *batch++ = MI_ARB_CHECK;
 223 
 224                 memset(batch, 0, 1024);
 225                 batch += 1024 / sizeof(*batch);
 226 
 227                 *batch++ = MI_ARB_CHECK;
 228                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 229                 *batch++ = lower_32_bits(vma->node.start);
 230         } else {
 231                 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
 232                 *batch++ = lower_32_bits(hws_address(hws, rq));
 233                 *batch++ = rq->fence.seqno;
 234                 *batch++ = MI_ARB_CHECK;
 235 
 236                 memset(batch, 0, 1024);
 237                 batch += 1024 / sizeof(*batch);
 238 
 239                 *batch++ = MI_ARB_CHECK;
 240                 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 241                 *batch++ = lower_32_bits(vma->node.start);
 242         }
 243         *batch++ = MI_BATCH_BUFFER_END; /* not reached */
 244         intel_gt_chipset_flush(engine->gt);
 245 
 246         if (rq->engine->emit_init_breadcrumb) {
 247                 err = rq->engine->emit_init_breadcrumb(rq);
 248                 if (err)
 249                         goto cancel_rq;
 250         }
 251 
 252         flags = 0;
 253         if (INTEL_GEN(gt->i915) <= 5)
 254                 flags |= I915_DISPATCH_SECURE;
 255 
 256         err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 257 
 258 cancel_rq:
 259         if (err) {
 260                 i915_request_skip(rq, err);
 261                 i915_request_add(rq);
 262         }
 263 unpin_hws:
 264         i915_vma_unpin(hws);
 265 unpin_vma:
 266         i915_vma_unpin(vma);
 267         return err ? ERR_PTR(err) : rq;
 268 }
 269 
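      /* Sample the seqno last written to the request's HWS slot */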
 270 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
 271 {
 272         return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 273 }
 274 
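      /* Terminate any spinning batch and release the hang state */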
 275 static void hang_fini(struct hang *h)
 276 {
 277         *h->batch = MI_BATCH_BUFFER_END;
 278         intel_gt_chipset_flush(h->gt);
 279 
 280         i915_gem_object_unpin_map(h->obj);
 281         i915_gem_object_put(h->obj);
 282 
 283         i915_gem_object_unpin_map(h->hws);
 284         i915_gem_object_put(h->hws);
 285 
 286         kernel_context_close(h->ctx);
 287 
 288         igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
 289 }
 290 
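      /* Poll the HWS until the hanging batch reports that it is executing */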
 291 static bool wait_until_running(struct hang *h, struct i915_request *rq)
 292 {
 293         return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
 294                                                rq->fence.seqno),
 295                              10) &&
 296                  wait_for(i915_seqno_passed(hws_seqno(h, rq),
 297                                             rq->fence.seqno),
 298                           1000));
 299 }
 300 
 301 static int igt_hang_sanitycheck(void *arg)
 302 {
 303         struct intel_gt *gt = arg;
 304         struct i915_request *rq;
 305         struct intel_engine_cs *engine;
 306         enum intel_engine_id id;
 307         struct hang h;
 308         int err;
 309 
 310         /* Basic check that we can execute our hanging batch */
 311 
 312         mutex_lock(&gt->i915->drm.struct_mutex);
 313         err = hang_init(&h, gt);
 314         if (err)
 315                 goto unlock;
 316 
 317         for_each_engine(engine, gt->i915, id) {
 318                 struct intel_wedge_me w;
 319                 long timeout;
 320 
 321                 if (!intel_engine_can_store_dword(engine))
 322                         continue;
 323 
 324                 rq = hang_create_request(&h, engine);
 325                 if (IS_ERR(rq)) {
 326                         err = PTR_ERR(rq);
 327                         pr_err("Failed to create request for %s, err=%d\n",
 328                                engine->name, err);
 329                         goto fini;
 330                 }
 331 
 332                 i915_request_get(rq);
 333 
 334                 *h.batch = MI_BATCH_BUFFER_END;
 335                 intel_gt_chipset_flush(engine->gt);
 336 
 337                 i915_request_add(rq);
 338 
 339                 timeout = 0;
 340                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
 341                         timeout = i915_request_wait(rq, 0,
 342                                                     MAX_SCHEDULE_TIMEOUT);
 343                 if (intel_gt_is_wedged(gt))
 344                         timeout = -EIO;
 345 
 346                 i915_request_put(rq);
 347 
 348                 if (timeout < 0) {
 349                         err = timeout;
 350                         pr_err("Wait for request failed on %s, err=%d\n",
 351                                engine->name, err);
 352                         goto fini;
 353                 }
 354         }
 355 
 356 fini:
 357         hang_fini(&h);
 358 unlock:
 359         mutex_unlock(&gt->i915->drm.struct_mutex);
 360         return err;
 361 }
 362 
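      /* Allow the engine up to IGT_IDLE_TIMEOUT ms to retire its work and idle */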
 363 static bool wait_for_idle(struct intel_engine_cs *engine)
 364 {
 365         return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
 366 }
 367 
 368 static int igt_reset_nop(void *arg)
 369 {
 370         struct intel_gt *gt = arg;
 371         struct i915_gpu_error *global = &gt->i915->gpu_error;
 372         struct intel_engine_cs *engine;
 373         struct i915_gem_context *ctx;
 374         unsigned int reset_count, count;
 375         enum intel_engine_id id;
 376         struct drm_file *file;
 377         IGT_TIMEOUT(end_time);
 378         int err = 0;
 379 
 380         /* Check that we can reset during non-user portions of requests */
 381 
 382         file = mock_file(gt->i915);
 383         if (IS_ERR(file))
 384                 return PTR_ERR(file);
 385 
 386         mutex_lock(&gt->i915->drm.struct_mutex);
 387         ctx = live_context(gt->i915, file);
 388         mutex_unlock(&gt->i915->drm.struct_mutex);
 389         if (IS_ERR(ctx)) {
 390                 err = PTR_ERR(ctx);
 391                 goto out;
 392         }
 393 
 394         i915_gem_context_clear_bannable(ctx);
 395         reset_count = i915_reset_count(global);
 396         count = 0;
 397         do {
 398                 mutex_lock(&gt->i915->drm.struct_mutex);
 399 
 400                 for_each_engine(engine, gt->i915, id) {
 401                         int i;
 402 
 403                         for (i = 0; i < 16; i++) {
 404                                 struct i915_request *rq;
 405 
 406                                 rq = igt_request_alloc(ctx, engine);
 407                                 if (IS_ERR(rq)) {
 408                                         err = PTR_ERR(rq);
 409                                         break;
 410                                 }
 411 
 412                                 i915_request_add(rq);
 413                         }
 414                 }
 415 
 416                 igt_global_reset_lock(gt);
 417                 intel_gt_reset(gt, ALL_ENGINES, NULL);
 418                 igt_global_reset_unlock(gt);
 419 
 420                 mutex_unlock(&gt->i915->drm.struct_mutex);
 421                 if (intel_gt_is_wedged(gt)) {
 422                         err = -EIO;
 423                         break;
 424                 }
 425 
 426                 if (i915_reset_count(global) != reset_count + ++count) {
 427                         pr_err("Full GPU reset not recorded!\n");
 428                         err = -EINVAL;
 429                         break;
 430                 }
 431 
 432                 err = igt_flush_test(gt->i915, 0);
 433                 if (err)
 434                         break;
 435         } while (time_before(jiffies, end_time));
 436         pr_info("%s: %d resets\n", __func__, count);
 437 
 438         mutex_lock(&gt->i915->drm.struct_mutex);
 439         err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 440         mutex_unlock(&gt->i915->drm.struct_mutex);
 441 
 442 out:
 443         mock_file_free(gt->i915, file);
 444         if (intel_gt_is_wedged(gt))
 445                 err = -EIO;
 446         return err;
 447 }
 448 
 449 static int igt_reset_nop_engine(void *arg)
 450 {
 451         struct intel_gt *gt = arg;
 452         struct i915_gpu_error *global = &gt->i915->gpu_error;
 453         struct intel_engine_cs *engine;
 454         struct i915_gem_context *ctx;
 455         enum intel_engine_id id;
 456         struct drm_file *file;
 457         int err = 0;
 458 
 459         /* Check that we can engine-reset during non-user portions */
 460 
 461         if (!intel_has_reset_engine(gt->i915))
 462                 return 0;
 463 
 464         file = mock_file(gt->i915);
 465         if (IS_ERR(file))
 466                 return PTR_ERR(file);
 467 
 468         mutex_lock(&gt->i915->drm.struct_mutex);
 469         ctx = live_context(gt->i915, file);
 470         mutex_unlock(&gt->i915->drm.struct_mutex);
 471         if (IS_ERR(ctx)) {
 472                 err = PTR_ERR(ctx);
 473                 goto out;
 474         }
 475 
 476         i915_gem_context_clear_bannable(ctx);
 477         for_each_engine(engine, gt->i915, id) {
 478                 unsigned int reset_count, reset_engine_count;
 479                 unsigned int count;
 480                 IGT_TIMEOUT(end_time);
 481 
 482                 reset_count = i915_reset_count(global);
 483                 reset_engine_count = i915_reset_engine_count(global, engine);
 484                 count = 0;
 485 
 486                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 487                 do {
 488                         int i;
 489 
 490                         if (!wait_for_idle(engine)) {
 491                                 pr_err("%s failed to idle before reset\n",
 492                                        engine->name);
 493                                 err = -EIO;
 494                                 break;
 495                         }
 496 
 497                         mutex_lock(&gt->i915->drm.struct_mutex);
 498                         for (i = 0; i < 16; i++) {
 499                                 struct i915_request *rq;
 500 
 501                                 rq = igt_request_alloc(ctx, engine);
 502                                 if (IS_ERR(rq)) {
 503                                         err = PTR_ERR(rq);
 504                                         break;
 505                                 }
 506 
 507                                 i915_request_add(rq);
 508                         }
 509                         err = intel_engine_reset(engine, NULL);
 510                         mutex_unlock(&gt->i915->drm.struct_mutex);
 511                         if (err) {
 512                                 pr_err("i915_reset_engine failed\n");
 513                                 break;
 514                         }
 515 
 516                         if (i915_reset_count(global) != reset_count) {
 517                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
 518                                 err = -EINVAL;
 519                                 break;
 520                         }
 521 
 522                         if (i915_reset_engine_count(global, engine) !=
 523                             reset_engine_count + ++count) {
 524                                 pr_err("%s engine reset not recorded!\n",
 525                                        engine->name);
 526                                 err = -EINVAL;
 527                                 break;
 528                         }
 529                 } while (time_before(jiffies, end_time));
 530                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 531                 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
 532 
 533                 if (err)
 534                         break;
 535 
 536                 err = igt_flush_test(gt->i915, 0);
 537                 if (err)
 538                         break;
 539         }
 540 
 541         mutex_lock(&gt->i915->drm.struct_mutex);
 542         err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 543         mutex_unlock(&gt->i915->drm.struct_mutex);
 544 
 545 out:
 546         mock_file_free(gt->i915, file);
 547         if (intel_gt_is_wedged(gt))
 548                 err = -EIO;
 549         return err;
 550 }
 551 
 552 static int __igt_reset_engine(struct intel_gt *gt, bool active)
 553 {
 554         struct i915_gpu_error *global = &gt->i915->gpu_error;
 555         struct intel_engine_cs *engine;
 556         enum intel_engine_id id;
 557         struct hang h;
 558         int err = 0;
 559 
 560         /* Check that we can issue an engine reset on an idle engine (no-op) */
 561 
 562         if (!intel_has_reset_engine(gt->i915))
 563                 return 0;
 564 
 565         if (active) {
 566                 mutex_lock(&gt->i915->drm.struct_mutex);
 567                 err = hang_init(&h, gt);
 568                 mutex_unlock(&gt->i915->drm.struct_mutex);
 569                 if (err)
 570                         return err;
 571         }
 572 
 573         for_each_engine(engine, gt->i915, id) {
 574                 unsigned int reset_count, reset_engine_count;
 575                 IGT_TIMEOUT(end_time);
 576 
 577                 if (active && !intel_engine_can_store_dword(engine))
 578                         continue;
 579 
 580                 if (!wait_for_idle(engine)) {
 581                         pr_err("%s failed to idle before reset\n",
 582                                engine->name);
 583                         err = -EIO;
 584                         break;
 585                 }
 586 
 587                 reset_count = i915_reset_count(global);
 588                 reset_engine_count = i915_reset_engine_count(global, engine);
 589 
 590                 intel_engine_pm_get(engine);
 591                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 592                 do {
 593                         if (active) {
 594                                 struct i915_request *rq;
 595 
 596                                 mutex_lock(&gt->i915->drm.struct_mutex);
 597                                 rq = hang_create_request(&h, engine);
 598                                 if (IS_ERR(rq)) {
 599                                         err = PTR_ERR(rq);
 600                                         mutex_unlock(&gt->i915->drm.struct_mutex);
 601                                         break;
 602                                 }
 603 
 604                                 i915_request_get(rq);
 605                                 i915_request_add(rq);
 606                                 mutex_unlock(&gt->i915->drm.struct_mutex);
 607 
 608                                 if (!wait_until_running(&h, rq)) {
 609                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
 610 
 611                                         pr_err("%s: Failed to start request %llx, at %x\n",
 612                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
 613                                         intel_engine_dump(engine, &p,
 614                                                           "%s\n", engine->name);
 615 
 616                                         i915_request_put(rq);
 617                                         err = -EIO;
 618                                         break;
 619                                 }
 620 
 621                                 i915_request_put(rq);
 622                         }
 623 
 624                         err = intel_engine_reset(engine, NULL);
 625                         if (err) {
 626                                 pr_err("i915_reset_engine failed\n");
 627                                 break;
 628                         }
 629 
 630                         if (i915_reset_count(global) != reset_count) {
 631                                 pr_err("Full GPU reset recorded! (engine reset expected)\n");
 632                                 err = -EINVAL;
 633                                 break;
 634                         }
 635 
 636                         if (i915_reset_engine_count(global, engine) !=
 637                             ++reset_engine_count) {
 638                                 pr_err("%s engine reset not recorded!\n",
 639                                        engine->name);
 640                                 err = -EINVAL;
 641                                 break;
 642                         }
 643                 } while (time_before(jiffies, end_time));
 644                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 645                 intel_engine_pm_put(engine);
 646 
 647                 if (err)
 648                         break;
 649 
 650                 err = igt_flush_test(gt->i915, 0);
 651                 if (err)
 652                         break;
 653         }
 654 
 655         if (intel_gt_is_wedged(gt))
 656                 err = -EIO;
 657 
 658         if (active) {
 659                 mutex_lock(&gt->i915->drm.struct_mutex);
 660                 hang_fini(&h);
 661                 mutex_unlock(&gt->i915->drm.struct_mutex);
 662         }
 663 
 664         return err;
 665 }
 666 
 667 static int igt_reset_idle_engine(void *arg)
 668 {
 669         return __igt_reset_engine(arg, false);
 670 }
 671 
 672 static int igt_reset_active_engine(void *arg)
 673 {
 674         return __igt_reset_engine(arg, true);
 675 }
 676 
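      /* State for a background kthread keeping one engine busy during resets */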
 677 struct active_engine {
 678         struct task_struct *task;
 679         struct intel_engine_cs *engine;
 680         unsigned long resets;
 681         unsigned int flags;
 682 };
 683 
 684 #define TEST_ACTIVE     BIT(0)
 685 #define TEST_OTHERS     BIT(1)
 686 #define TEST_SELF       BIT(2)
 687 #define TEST_PRIORITY   BIT(3)
 688 
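      /* Wait for a background request, wedging the GT if it fails to complete */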
 689 static int active_request_put(struct i915_request *rq)
 690 {
 691         int err = 0;
 692 
 693         if (!rq)
 694                 return 0;
 695 
 696         if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
 697                 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
 698                           rq->engine->name,
 699                           rq->fence.context,
 700                           rq->fence.seqno);
 701                 GEM_TRACE_DUMP();
 702 
 703                 intel_gt_set_wedged(rq->engine->gt);
 704                 err = -EIO;
 705         }
 706 
 707         i915_request_put(rq);
 708 
 709         return err;
 710 }
 711 
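      /*
       * kthread body: keep the target engine busy with a rolling window of
       * requests spread across several contexts until asked to stop.
       */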
 712 static int active_engine(void *data)
 713 {
 714         I915_RND_STATE(prng);
 715         struct active_engine *arg = data;
 716         struct intel_engine_cs *engine = arg->engine;
 717         struct i915_request *rq[8] = {};
 718         struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
 719         struct drm_file *file;
 720         unsigned long count = 0;
 721         int err = 0;
 722 
 723         file = mock_file(engine->i915);
 724         if (IS_ERR(file))
 725                 return PTR_ERR(file);
 726 
 727         for (count = 0; count < ARRAY_SIZE(ctx); count++) {
 728                 mutex_lock(&engine->i915->drm.struct_mutex);
 729                 ctx[count] = live_context(engine->i915, file);
 730                 mutex_unlock(&engine->i915->drm.struct_mutex);
 731                 if (IS_ERR(ctx[count])) {
 732                         err = PTR_ERR(ctx[count]);
 733                         while (--count)
 734                                 i915_gem_context_put(ctx[count]);
 735                         goto err_file;
 736                 }
 737         }
 738 
 739         while (!kthread_should_stop()) {
 740                 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
 741                 struct i915_request *old = rq[idx];
 742                 struct i915_request *new;
 743 
 744                 mutex_lock(&engine->i915->drm.struct_mutex);
 745                 new = igt_request_alloc(ctx[idx], engine);
 746                 if (IS_ERR(new)) {
 747                         mutex_unlock(&engine->i915->drm.struct_mutex);
 748                         err = PTR_ERR(new);
 749                         break;
 750                 }
 751 
 752                 if (arg->flags & TEST_PRIORITY)
 753                         ctx[idx]->sched.priority =
 754                                 i915_prandom_u32_max_state(512, &prng);
 755 
 756                 rq[idx] = i915_request_get(new);
 757                 i915_request_add(new);
 758                 mutex_unlock(&engine->i915->drm.struct_mutex);
 759 
 760                 err = active_request_put(old);
 761                 if (err)
 762                         break;
 763 
 764                 cond_resched();
 765         }
 766 
 767         for (count = 0; count < ARRAY_SIZE(rq); count++) {
 768                 int err__ = active_request_put(rq[count]);
 769 
 770                 /* Keep the first error */
 771                 if (!err)
 772                         err = err__;
 773         }
 774 
 775 err_file:
 776         mock_file_free(engine->i915, file);
 777         return err;
 778 }
 779 
 780 static int __igt_reset_engines(struct intel_gt *gt,
 781                                const char *test_name,
 782                                unsigned int flags)
 783 {
 784         struct i915_gpu_error *global = &gt->i915->gpu_error;
 785         struct intel_engine_cs *engine, *other;
 786         enum intel_engine_id id, tmp;
 787         struct hang h;
 788         int err = 0;
 789 
 790         /* Check that issuing a reset on one engine does not interfere
 791          * with any other engine.
 792          */
 793 
 794         if (!intel_has_reset_engine(gt->i915))
 795                 return 0;
 796 
 797         if (flags & TEST_ACTIVE) {
 798                 mutex_lock(&gt->i915->drm.struct_mutex);
 799                 err = hang_init(&h, gt);
 800                 mutex_unlock(&gt->i915->drm.struct_mutex);
 801                 if (err)
 802                         return err;
 803 
 804                 if (flags & TEST_PRIORITY)
 805                         h.ctx->sched.priority = 1024;
 806         }
 807 
 808         for_each_engine(engine, gt->i915, id) {
 809                 struct active_engine threads[I915_NUM_ENGINES] = {};
 810                 unsigned long device = i915_reset_count(global);
 811                 unsigned long count = 0, reported;
 812                 IGT_TIMEOUT(end_time);
 813 
 814                 if (flags & TEST_ACTIVE &&
 815                     !intel_engine_can_store_dword(engine))
 816                         continue;
 817 
 818                 if (!wait_for_idle(engine)) {
 819                         pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
 820                                engine->name, test_name);
 821                         err = -EIO;
 822                         break;
 823                 }
 824 
 825                 memset(threads, 0, sizeof(threads));
 826                 for_each_engine(other, gt->i915, tmp) {
 827                         struct task_struct *tsk;
 828 
 829                         threads[tmp].resets =
 830                                 i915_reset_engine_count(global, other);
 831 
 832                         if (!(flags & TEST_OTHERS))
 833                                 continue;
 834 
 835                         if (other == engine && !(flags & TEST_SELF))
 836                                 continue;
 837 
 838                         threads[tmp].engine = other;
 839                         threads[tmp].flags = flags;
 840 
 841                         tsk = kthread_run(active_engine, &threads[tmp],
 842                                           "igt/%s", other->name);
 843                         if (IS_ERR(tsk)) {
 844                                 err = PTR_ERR(tsk);
 845                                 goto unwind;
 846                         }
 847 
 848                         threads[tmp].task = tsk;
 849                         get_task_struct(tsk);
 850                 }
 851 
 852                 intel_engine_pm_get(engine);
 853                 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 854                 do {
 855                         struct i915_request *rq = NULL;
 856 
 857                         if (flags & TEST_ACTIVE) {
 858                                 mutex_lock(&gt->i915->drm.struct_mutex);
 859                                 rq = hang_create_request(&h, engine);
 860                                 if (IS_ERR(rq)) {
 861                                         err = PTR_ERR(rq);
 862                                         mutex_unlock(&gt->i915->drm.struct_mutex);
 863                                         break;
 864                                 }
 865 
 866                                 i915_request_get(rq);
 867                                 i915_request_add(rq);
 868                                 mutex_unlock(&gt->i915->drm.struct_mutex);
 869 
 870                                 if (!wait_until_running(&h, rq)) {
 871                                         struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
 872 
 873                                         pr_err("%s: Failed to start request %llx, at %x\n",
 874                                                __func__, rq->fence.seqno, hws_seqno(&h, rq));
 875                                         intel_engine_dump(engine, &p,
 876                                                           "%s\n", engine->name);
 877 
 878                                         i915_request_put(rq);
 879                                         err = -EIO;
 880                                         break;
 881                                 }
 882                         }
 883 
 884                         err = intel_engine_reset(engine, NULL);
 885                         if (err) {
 886                                 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
 887                                        engine->name, test_name, err);
 888                                 break;
 889                         }
 890 
 891                         count++;
 892 
 893                         if (rq) {
 894                                 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
 895                                         struct drm_printer p =
 896                                                 drm_info_printer(gt->i915->drm.dev);
 897 
 898                                         pr_err("i915_reset_engine(%s:%s):"
 899                                                " failed to complete request after reset\n",
 900                                                engine->name, test_name);
 901                                         intel_engine_dump(engine, &p,
 902                                                           "%s\n", engine->name);
 903                                         i915_request_put(rq);
 904 
 905                                         GEM_TRACE_DUMP();
 906                                         intel_gt_set_wedged(gt);
 907                                         err = -EIO;
 908                                         break;
 909                                 }
 910 
 911                                 i915_request_put(rq);
 912                         }
 913 
 914                         if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
 915                                 struct drm_printer p =
 916                                         drm_info_printer(gt->i915->drm.dev);
 917 
 918                                 pr_err("i915_reset_engine(%s:%s):"
 919                                        " failed to idle after reset\n",
 920                                        engine->name, test_name);
 921                                 intel_engine_dump(engine, &p,
 922                                                   "%s\n", engine->name);
 923 
 924                                 err = -EIO;
 925                                 break;
 926                         }
 927                 } while (time_before(jiffies, end_time));
 928                 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
 929                 intel_engine_pm_put(engine);
 930                 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
 931                         engine->name, test_name, count);
 932 
 933                 reported = i915_reset_engine_count(global, engine);
 934                 reported -= threads[engine->id].resets;
 935                 if (reported != count) {
 936                         pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
 937                                engine->name, test_name, count, reported);
 938                         if (!err)
 939                                 err = -EINVAL;
 940                 }
 941 
 942 unwind:
 943                 for_each_engine(other, gt->i915, tmp) {
 944                         int ret;
 945 
 946                         if (!threads[tmp].task)
 947                                 continue;
 948 
 949                         ret = kthread_stop(threads[tmp].task);
 950                         if (ret) {
 951                                 pr_err("kthread for other engine %s failed, err=%d\n",
 952                                        other->name, ret);
 953                                 if (!err)
 954                                         err = ret;
 955                         }
 956                         put_task_struct(threads[tmp].task);
 957 
 958                         if (other->uabi_class != engine->uabi_class &&
 959                             threads[tmp].resets !=
 960                             i915_reset_engine_count(global, other)) {
 961                                 pr_err("Innocent engine %s was reset (count=%ld)\n",
 962                                        other->name,
 963                                        i915_reset_engine_count(global, other) -
 964                                        threads[tmp].resets);
 965                                 if (!err)
 966                                         err = -EINVAL;
 967                         }
 968                 }
 969 
 970                 if (device != i915_reset_count(global)) {
 971                         pr_err("Global reset (count=%ld)!\n",
 972                                i915_reset_count(global) - device);
 973                         if (!err)
 974                                 err = -EINVAL;
 975                 }
 976 
 977                 if (err)
 978                         break;
 979 
 980                 mutex_lock(&gt->i915->drm.struct_mutex);
 981                 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
 982                 mutex_unlock(&gt->i915->drm.struct_mutex);
 983                 if (err)
 984                         break;
 985         }
 986 
 987         if (intel_gt_is_wedged(gt))
 988                 err = -EIO;
 989 
 990         if (flags & TEST_ACTIVE) {
 991                 mutex_lock(&gt->i915->drm.struct_mutex);
 992                 hang_fini(&h);
 993                 mutex_unlock(&gt->i915->drm.struct_mutex);
 994         }
 995 
 996         return err;
 997 }
 998 
 999 static int igt_reset_engines(void *arg)
1000 {
1001         static const struct {
1002                 const char *name;
1003                 unsigned int flags;
1004         } phases[] = {
1005                 { "idle", 0 },
1006                 { "active", TEST_ACTIVE },
1007                 { "others-idle", TEST_OTHERS },
1008                 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1009                 {
1010                         "others-priority",
1011                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1012                 },
1013                 {
1014                         "self-priority",
1015                         TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1016                 },
1017                 { }
1018         };
1019         struct intel_gt *gt = arg;
1020         typeof(*phases) *p;
1021         int err;
1022 
1023         for (p = phases; p->name; p++) {
1024                 if (p->flags & TEST_PRIORITY) {
1025                         if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1026                                 continue;
1027                 }
1028 
1029                 err = __igt_reset_engines(arg, p->name, p->flags);
1030                 if (err)
1031                         return err;
1032         }
1033 
1034         return 0;
1035 }
1036 
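      /*
       * Simulate hangcheck firing: reset the selected engines and return the
       * device reset count sampled beforehand.
       */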
1037 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1038 {
1039         u32 count = i915_reset_count(&gt->i915->gpu_error);
1040 
1041         intel_gt_reset(gt, mask, NULL);
1042 
1043         return count;
1044 }
1045 
1046 static int igt_reset_wait(void *arg)
1047 {
1048         struct intel_gt *gt = arg;
1049         struct i915_gpu_error *global = &gt->i915->gpu_error;
1050         struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1051         struct i915_request *rq;
1052         unsigned int reset_count;
1053         struct hang h;
1054         long timeout;
1055         int err;
1056 
1057         if (!engine || !intel_engine_can_store_dword(engine))
1058                 return 0;
1059 
1060         /* Check that we detect a stuck waiter and issue a reset */
1061 
1062         igt_global_reset_lock(gt);
1063 
1064         mutex_lock(&gt->i915->drm.struct_mutex);
1065         err = hang_init(&h, gt);
1066         if (err)
1067                 goto unlock;
1068 
1069         rq = hang_create_request(&h, engine);
1070         if (IS_ERR(rq)) {
1071                 err = PTR_ERR(rq);
1072                 goto fini;
1073         }
1074 
1075         i915_request_get(rq);
1076         i915_request_add(rq);
1077 
1078         if (!wait_until_running(&h, rq)) {
1079                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1080 
1081                 pr_err("%s: Failed to start request %llx, at %x\n",
1082                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1083                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1084 
1085                 intel_gt_set_wedged(gt);
1086 
1087                 err = -EIO;
1088                 goto out_rq;
1089         }
1090 
1091         reset_count = fake_hangcheck(gt, ALL_ENGINES);
1092 
1093         timeout = i915_request_wait(rq, 0, 10);
1094         if (timeout < 0) {
1095                 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1096                        timeout);
1097                 err = timeout;
1098                 goto out_rq;
1099         }
1100 
1101         if (i915_reset_count(global) == reset_count) {
1102                 pr_err("No GPU reset recorded!\n");
1103                 err = -EINVAL;
1104                 goto out_rq;
1105         }
1106 
1107 out_rq:
1108         i915_request_put(rq);
1109 fini:
1110         hang_fini(&h);
1111 unlock:
1112         mutex_unlock(&gt->i915->drm.struct_mutex);
1113         igt_global_reset_unlock(gt);
1114 
1115         if (intel_gt_is_wedged(gt))
1116                 return -EIO;
1117 
1118         return err;
1119 }
1120 
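      /* Arguments shared with the evict_vma/evict_fence kthreads */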
1121 struct evict_vma {
1122         struct completion completion;
1123         struct i915_vma *vma;
1124 };
1125 
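      /* kthread body: try to evict the node still pinned by the hanging request */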
1126 static int evict_vma(void *data)
1127 {
1128         struct evict_vma *arg = data;
1129         struct i915_address_space *vm = arg->vma->vm;
1130         struct drm_i915_private *i915 = vm->i915;
1131         struct drm_mm_node evict = arg->vma->node;
1132         int err;
1133 
1134         complete(&arg->completion);
1135 
1136         mutex_lock(&i915->drm.struct_mutex);
1137         err = i915_gem_evict_for_node(vm, &evict, 0);
1138         mutex_unlock(&i915->drm.struct_mutex);
1139 
1140         return err;
1141 }
1142 
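      /* kthread body: re-tile the busy object and pin a fence to force a fence register update */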
1143 static int evict_fence(void *data)
1144 {
1145         struct evict_vma *arg = data;
1146         struct drm_i915_private *i915 = arg->vma->vm->i915;
1147         int err;
1148 
1149         complete(&arg->completion);
1150 
1151         mutex_lock(&i915->drm.struct_mutex);
1152 
1153         /* Mark the fence register as dirty to force the mmio update. */
1154         err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1155         if (err) {
1156                 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1157                 goto out_unlock;
1158         }
1159 
1160         err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1161         if (err) {
1162                 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1163                 goto out_unlock;
1164         }
1165 
1166         err = i915_vma_pin_fence(arg->vma);
1167         i915_vma_unpin(arg->vma);
1168         if (err) {
1169                 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1170                 goto out_unlock;
1171         }
1172 
1173         i915_vma_unpin_fence(arg->vma);
1174 
1175 out_unlock:
1176         mutex_unlock(&i915->drm.struct_mutex);
1177 
1178         return err;
1179 }
1180 
1181 static int __igt_reset_evict_vma(struct intel_gt *gt,
1182                                  struct i915_address_space *vm,
1183                                  int (*fn)(void *),
1184                                  unsigned int flags)
1185 {
1186         struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1187         struct drm_i915_gem_object *obj;
1188         struct task_struct *tsk = NULL;
1189         struct i915_request *rq;
1190         struct evict_vma arg;
1191         struct hang h;
1192         int err;
1193 
1194         if (!engine || !intel_engine_can_store_dword(engine))
1195                 return 0;
1196 
1197         /* Check that we can recover an unbind stuck on a hanging request */
1198 
1199         mutex_lock(&gt->i915->drm.struct_mutex);
1200         err = hang_init(&h, gt);
1201         if (err)
1202                 goto unlock;
1203 
1204         obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1205         if (IS_ERR(obj)) {
1206                 err = PTR_ERR(obj);
1207                 goto fini;
1208         }
1209 
1210         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1211                 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1212                 if (err) {
1213                         pr_err("Invalid X-tiling settings; err:%d\n", err);
1214                         goto out_obj;
1215                 }
1216         }
1217 
1218         arg.vma = i915_vma_instance(obj, vm, NULL);
1219         if (IS_ERR(arg.vma)) {
1220                 err = PTR_ERR(arg.vma);
1221                 goto out_obj;
1222         }
1223 
1224         rq = hang_create_request(&h, engine);
1225         if (IS_ERR(rq)) {
1226                 err = PTR_ERR(rq);
1227                 goto out_obj;
1228         }
1229 
1230         err = i915_vma_pin(arg.vma, 0, 0,
1231                            i915_vma_is_ggtt(arg.vma) ?
1232                            PIN_GLOBAL | PIN_MAPPABLE :
1233                            PIN_USER);
1234         if (err) {
1235                 i915_request_add(rq);
1236                 goto out_obj;
1237         }
1238 
1239         if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1240                 err = i915_vma_pin_fence(arg.vma);
1241                 if (err) {
1242                         pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1243                         i915_vma_unpin(arg.vma);
1244                         i915_request_add(rq);
1245                         goto out_obj;
1246                 }
1247         }
1248 
1249         i915_vma_lock(arg.vma);
1250         err = i915_request_await_object(rq, arg.vma->obj,
1251                                         flags & EXEC_OBJECT_WRITE);
1252         if (err == 0)
1253                 err = i915_vma_move_to_active(arg.vma, rq, flags);
1254         i915_vma_unlock(arg.vma);
1255 
1256         if (flags & EXEC_OBJECT_NEEDS_FENCE)
1257                 i915_vma_unpin_fence(arg.vma);
1258         i915_vma_unpin(arg.vma);
1259 
1260         i915_request_get(rq);
1261         i915_request_add(rq);
1262         if (err)
1263                 goto out_rq;
1264 
1265         mutex_unlock(&gt->i915->drm.struct_mutex);
1266 
1267         if (!wait_until_running(&h, rq)) {
1268                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1269 
1270                 pr_err("%s: Failed to start request %llx, at %x\n",
1271                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1272                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1273 
1274                 intel_gt_set_wedged(gt);
1275                 goto out_reset;
1276         }
1277 
1278         init_completion(&arg.completion);
1279 
1280         tsk = kthread_run(fn, &arg, "igt/evict_vma");
1281         if (IS_ERR(tsk)) {
1282                 err = PTR_ERR(tsk);
1283                 tsk = NULL;
1284                 goto out_reset;
1285         }
1286         get_task_struct(tsk);
1287 
1288         wait_for_completion(&arg.completion);
1289 
1290         if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1291                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1292 
1293                 pr_err("igt/evict_vma kthread did not wait\n");
1294                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1295 
1296                 intel_gt_set_wedged(gt);
1297                 goto out_reset;
1298         }
1299 
1300 out_reset:
1301         igt_global_reset_lock(gt);
1302         fake_hangcheck(gt, rq->engine->mask);
1303         igt_global_reset_unlock(gt);
1304 
1305         if (tsk) {
1306                 struct intel_wedge_me w;
1307 
1308                 /* The reset, even indirectly, should take less than 10ms. */
1309                 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1310                         err = kthread_stop(tsk);
1311 
1312                 put_task_struct(tsk);
1313         }
1314 
1315         mutex_lock(&gt->i915->drm.struct_mutex);
1316 out_rq:
1317         i915_request_put(rq);
1318 out_obj:
1319         i915_gem_object_put(obj);
1320 fini:
1321         hang_fini(&h);
1322 unlock:
1323         mutex_unlock(&gt->i915->drm.struct_mutex);
1324 
1325         if (intel_gt_is_wedged(gt))
1326                 return -EIO;
1327 
1328         return err;
1329 }
1330 
1331 static int igt_reset_evict_ggtt(void *arg)
1332 {
1333         struct intel_gt *gt = arg;
1334 
1335         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1336                                      evict_vma, EXEC_OBJECT_WRITE);
1337 }
1338 
1339 static int igt_reset_evict_ppgtt(void *arg)
1340 {
1341         struct intel_gt *gt = arg;
1342         struct i915_gem_context *ctx;
1343         struct drm_file *file;
1344         int err;
1345 
1346         file = mock_file(gt->i915);
1347         if (IS_ERR(file))
1348                 return PTR_ERR(file);
1349 
1350         mutex_lock(&gt->i915->drm.struct_mutex);
1351         ctx = live_context(gt->i915, file);
1352         mutex_unlock(&gt->i915->drm.struct_mutex);
1353         if (IS_ERR(ctx)) {
1354                 err = PTR_ERR(ctx);
1355                 goto out;
1356         }
1357 
1358         err = 0;
1359         if (ctx->vm) /* aliasing == global gtt locking, covered above */
1360                 err = __igt_reset_evict_vma(gt, ctx->vm,
1361                                             evict_vma, EXEC_OBJECT_WRITE);
1362 
1363 out:
1364         mock_file_free(gt->i915, file);
1365         return err;
1366 }
1367 
1368 static int igt_reset_evict_fence(void *arg)
1369 {
1370         struct intel_gt *gt = arg;
1371 
1372         return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1373                                      evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1374 }
1375 
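      /* Wait for every engine other than @exclude to idle */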
1376 static int wait_for_others(struct intel_gt *gt,
1377                            struct intel_engine_cs *exclude)
1378 {
1379         struct intel_engine_cs *engine;
1380         enum intel_engine_id id;
1381 
1382         for_each_engine(engine, gt->i915, id) {
1383                 if (engine == exclude)
1384                         continue;
1385 
1386                 if (!wait_for_idle(engine))
1387                         return -EIO;
1388         }
1389 
1390         return 0;
1391 }
1392 
1393 static int igt_reset_queue(void *arg)
1394 {
1395         struct intel_gt *gt = arg;
1396         struct i915_gpu_error *global = &gt->i915->gpu_error;
1397         struct intel_engine_cs *engine;
1398         enum intel_engine_id id;
1399         struct hang h;
1400         int err;
1401 
1402         /* Check that we replay pending requests following a hang */
1403 
1404         igt_global_reset_lock(gt);
1405 
1406         mutex_lock(&gt->i915->drm.struct_mutex);
1407         err = hang_init(&h, gt);
1408         if (err)
1409                 goto unlock;
1410 
1411         for_each_engine(engine, gt->i915, id) {
1412                 struct i915_request *prev;
1413                 IGT_TIMEOUT(end_time);
1414                 unsigned int count;
1415 
1416                 if (!intel_engine_can_store_dword(engine))
1417                         continue;
1418 
1419                 prev = hang_create_request(&h, engine);
1420                 if (IS_ERR(prev)) {
1421                         err = PTR_ERR(prev);
1422                         goto fini;
1423                 }
1424 
1425                 i915_request_get(prev);
1426                 i915_request_add(prev);
1427 
1428                 count = 0;
1429                 do {
1430                         struct i915_request *rq;
1431                         unsigned int reset_count;
1432 
1433                         rq = hang_create_request(&h, engine);
1434                         if (IS_ERR(rq)) {
1435                                 err = PTR_ERR(rq);
1436                                 goto fini;
1437                         }
1438 
1439                         i915_request_get(rq);
1440                         i915_request_add(rq);
1441 
1442                         /*
1443                          * XXX We don't handle resetting the kernel context
1444                          * very well. If we trigger a device reset twice in
1445                          * quick succession while the kernel context is
1446                          * executing, we may end up skipping the breadcrumb.
1447                          * This is really only a problem for the selftest as
1448                          * normally there is a large interlude between resets
1449                          * (hangcheck), or we focus on resetting just one
1450                          * engine and so avoid repeatedly resetting innocents.
1451                          */
1452                         err = wait_for_others(gt, engine);
1453                         if (err) {
1454                                 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1455                                        __func__, engine->name);
1456                                 i915_request_put(rq);
1457                                 i915_request_put(prev);
1458 
1459                                 GEM_TRACE_DUMP();
1460                                 intel_gt_set_wedged(gt);
1461                                 goto fini;
1462                         }
1463 
1464                         if (!wait_until_running(&h, prev)) {
1465                                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1466 
1467                                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1468                                        __func__, engine->name,
1469                                        prev->fence.seqno, hws_seqno(&h, prev));
1470                                 intel_engine_dump(engine, &p,
1471                                                   "%s\n", engine->name);
1472 
1473                                 i915_request_put(rq);
1474                                 i915_request_put(prev);
1475 
1476                                 intel_gt_set_wedged(gt);
1477 
1478                                 err = -EIO;
1479                                 goto fini;
1480                         }
1481 
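                             /*
                              * fake_hangcheck() kicks the reset path for this
                              * engine, much as hangcheck would, and returns the
                              * global reset count sampled beforehand so we can
                              * verify a reset really was recorded.
                              */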
1482                         reset_count = fake_hangcheck(gt, BIT(id));
1483 
1484                         if (prev->fence.error != -EIO) {
1485                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1486                                        prev->fence.error);
1487                                 i915_request_put(rq);
1488                                 i915_request_put(prev);
1489                                 err = -EINVAL;
1490                                 goto fini;
1491                         }
1492 
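                             /*
                              * The queued request was an innocent bystander: the
                              * reset must not mark it with an error, as it is
                              * expected to be replayed once the engine recovers.
                              */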
1493                         if (rq->fence.error) {
1494                                 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1495                                        rq->fence.error);
1496                                 i915_request_put(rq);
1497                                 i915_request_put(prev);
1498                                 err = -EINVAL;
1499                                 goto fini;
1500                         }
1501 
1502                         if (i915_reset_count(global) == reset_count) {
1503                                 pr_err("No GPU reset recorded!\n");
1504                                 i915_request_put(rq);
1505                                 i915_request_put(prev);
1506                                 err = -EINVAL;
1507                                 goto fini;
1508                         }
1509 
1510                         i915_request_put(prev);
1511                         prev = rq;
1512                         count++;
1513                 } while (time_before(jiffies, end_time));
1514                 pr_info("%s: Completed %d resets\n", engine->name, count);
1515 
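                     /*
                      * Break the infinite loop in the hanging batch by rewriting
                      * its first dword to MI_BATCH_BUFFER_END, and flush the CPU
                      * write so the GPU sees it and the final request can retire.
                      */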
1516                 *h.batch = MI_BATCH_BUFFER_END;
1517                 intel_gt_chipset_flush(engine->gt);
1518 
1519                 i915_request_put(prev);
1520 
1521                 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1522                 if (err)
1523                         break;
1524         }
1525 
1526 fini:
1527         hang_fini(&h);
1528 unlock:
1529         mutex_unlock(&gt->i915->drm.struct_mutex);
1530         igt_global_reset_unlock(gt);
1531 
1532         if (intel_gt_is_wedged(gt))
1533                 return -EIO;
1534 
1535         return err;
1536 }
1537 
1538 static int igt_handle_error(void *arg)
1539 {
1540         struct intel_gt *gt = arg;
1541         struct i915_gpu_error *global = &gt->i915->gpu_error;
1542         struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1543         struct hang h;
1544         struct i915_request *rq;
1545         struct i915_gpu_state *error;
1546         int err;
1547 
1548         /* Check that the error handler identifies the guilty request on an engine reset */
1549 
1550         if (!intel_has_reset_engine(gt->i915))
1551                 return 0;
1552 
1553         if (!engine || !intel_engine_can_store_dword(engine))
1554                 return 0;
1555 
1556         mutex_lock(&gt->i915->drm.struct_mutex);
1557 
1558         err = hang_init(&h, gt);
1559         if (err)
1560                 goto err_unlock;
1561 
1562         rq = hang_create_request(&h, engine);
1563         if (IS_ERR(rq)) {
1564                 err = PTR_ERR(rq);
1565                 goto err_fini;
1566         }
1567 
1568         i915_request_get(rq);
1569         i915_request_add(rq);
1570 
1571         if (!wait_until_running(&h, rq)) {
1572                 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1573 
1574                 pr_err("%s: Failed to start request %llx, at %x\n",
1575                        __func__, rq->fence.seqno, hws_seqno(&h, rq));
1576                 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1577 
1578                 intel_gt_set_wedged(gt);
1579 
1580                 err = -EIO;
1581                 goto err_request;
1582         }
1583 
1584         mutex_unlock(&gt->i915->drm.struct_mutex);
1585 
1586         /* Temporarily disable error capture: a non-NULL first_error makes
              * the capture path skip recording this deliberately injected hang */
1587         error = xchg(&global->first_error, (void *)-1);
1588 
1589         intel_gt_handle_error(gt, engine->mask, 0, NULL);
1590 
1591         xchg(&global->first_error, error);
1592 
1593         mutex_lock(&gt->i915->drm.struct_mutex);
1594 
1595         if (rq->fence.error != -EIO) {
1596                 pr_err("Guilty request not identified!\n");
1597                 err = -EINVAL;
1598                 goto err_request;
1599         }
1600 
1601 err_request:
1602         i915_request_put(rq);
1603 err_fini:
1604         hang_fini(&h);
1605 err_unlock:
1606         mutex_unlock(&gt->i915->drm.struct_mutex);
1607         return err;
1608 }
1609 
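     /*
      * Disable the submission tasklet and enter the requested atomic section
      * (e.g. irqs or preemption off) around intel_engine_reset() to check that
      * an engine reset is safe to perform from atomic context.
      */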
1610 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1611                                      const struct igt_atomic_section *p,
1612                                      const char *mode)
1613 {
1614         struct tasklet_struct * const t = &engine->execlists.tasklet;
1615         int err;
1616 
1617         GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1618                   engine->name, mode, p->name);
1619 
1620         tasklet_disable_nosync(t);
1621         p->critical_section_begin();
1622 
1623         err = intel_engine_reset(engine, NULL);
1624 
1625         p->critical_section_end();
1626         tasklet_enable(t);
1627 
1628         if (err)
1629                 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1630                        engine->name, mode, p->name);
1631 
1632         return err;
1633 }
1634 
1635 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1636                                    const struct igt_atomic_section *p)
1637 {
1638         struct i915_request *rq;
1639         struct hang h;
1640         int err;
1641 
1642         err = __igt_atomic_reset_engine(engine, p, "idle");
1643         if (err)
1644                 return err;
1645 
1646         err = hang_init(&h, engine->gt);
1647         if (err)
1648                 return err;
1649 
1650         rq = hang_create_request(&h, engine);
1651         if (IS_ERR(rq)) {
1652                 err = PTR_ERR(rq);
1653                 goto out;
1654         }
1655 
1656         i915_request_get(rq);
1657         i915_request_add(rq);
1658 
1659         if (wait_until_running(&h, rq)) {
1660                 err = __igt_atomic_reset_engine(engine, p, "active");
1661         } else {
1662                 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1663                        __func__, engine->name,
1664                        rq->fence.seqno, hws_seqno(&h, rq));
1665                 intel_gt_set_wedged(engine->gt);
1666                 err = -EIO;
1667         }
1668 
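             /*
              * A successful reset should have cancelled the spinner, so the wait
              * below is expected to complete quickly; wedge the GT if it does
              * not, so the selftest cannot get stuck behind a dead engine.
              */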
1669         if (err == 0) {
1670                 struct intel_wedge_me w;
1671 
1672                 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1673                         i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1674                 if (intel_gt_is_wedged(engine->gt))
1675                         err = -EIO;
1676         }
1677 
1678         i915_request_put(rq);
1679 out:
1680         hang_fini(&h);
1681         return err;
1682 }
1683 
1684 static int igt_reset_engines_atomic(void *arg)
1685 {
1686         struct intel_gt *gt = arg;
1687         const typeof(*igt_atomic_phases) *p;
1688         int err = 0;
1689 
1690         /* Check that engine resets are usable from atomic context */
1691 
1692         if (!intel_has_reset_engine(gt->i915))
1693                 return 0;
1694 
1695         if (USES_GUC_SUBMISSION(gt->i915))
1696                 return 0;
1697 
1698         igt_global_reset_lock(gt);
1699         mutex_lock(&gt->i915->drm.struct_mutex);
1700 
1701         /* Flush any requests before we get started and check basics */
1702         if (!igt_force_reset(gt))
1703                 goto unlock;
1704 
1705         for (p = igt_atomic_phases; p->name; p++) {
1706                 struct intel_engine_cs *engine;
1707                 enum intel_engine_id id;
1708 
1709                 for_each_engine(engine, gt->i915, id) {
1710                         err = igt_atomic_reset_engine(engine, p);
1711                         if (err)
1712                                 goto out;
1713                 }
1714         }
1715 
1716 out:
1717         /* As we poke around the guts, do a full reset before continuing. */
1718         igt_force_reset(gt);
1719 
1720 unlock:
1721         mutex_unlock(&gt->i915->drm.struct_mutex);
1722         igt_global_reset_unlock(gt);
1723 
1724         return err;
1725 }
1726 
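     /*
      * Entry point for the live hangcheck selftest group. It is expected to be
      * registered as selftest(hangcheck, intel_hangcheck_live_selftests) in
      * i915_live_selftests.h and run when the driver is built with
      * CONFIG_DRM_I915_SELFTEST and live selftests are requested at module load
      * (assumption: via the i915.selftests module parameter).
      */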
1727 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1728 {
1729         static const struct i915_subtest tests[] = {
1730                 SUBTEST(igt_hang_sanitycheck),
1731                 SUBTEST(igt_reset_nop),
1732                 SUBTEST(igt_reset_nop_engine),
1733                 SUBTEST(igt_reset_idle_engine),
1734                 SUBTEST(igt_reset_active_engine),
1735                 SUBTEST(igt_reset_engines),
1736                 SUBTEST(igt_reset_engines_atomic),
1737                 SUBTEST(igt_reset_queue),
1738                 SUBTEST(igt_reset_wait),
1739                 SUBTEST(igt_reset_evict_ggtt),
1740                 SUBTEST(igt_reset_evict_ppgtt),
1741                 SUBTEST(igt_reset_evict_fence),
1742                 SUBTEST(igt_handle_error),
1743         };
1744         struct intel_gt *gt = &i915->gt;
1745         intel_wakeref_t wakeref;
1746         bool saved_hangcheck;
1747         int err;
1748 
1749         if (!intel_has_gpu_reset(gt->i915))
1750                 return 0;
1751 
1752         if (intel_gt_is_wedged(gt))
1753                 return -EIO; /* we're long past hope of a successful reset */
1754 
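             /*
              * Hold a runtime-PM wakeref for the whole run and temporarily
              * disable periodic hangcheck so the background worker does not
              * interfere with the resets the subtests inject themselves.
              */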
1755         wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
1756         saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1757         drain_delayed_work(&gt->hangcheck.work); /* let the disabled param take effect */
1758 
1759         err = intel_gt_live_subtests(tests, gt);
1760 
1761         mutex_lock(&gt->i915->drm.struct_mutex);
1762         igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1763         mutex_unlock(&gt->i915->drm.struct_mutex);
1764 
1765         i915_modparams.enable_hangcheck = saved_hangcheck;
1766         intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
1767 
1768         return err;
1769 }
