This source file includes the following definitions:
- hang_init
- hws_address
- move_to_active
- hang_create_request
- hws_seqno
- hang_fini
- wait_until_running
- igt_hang_sanitycheck
- wait_for_idle
- igt_reset_nop
- igt_reset_nop_engine
- __igt_reset_engine
- igt_reset_idle_engine
- igt_reset_active_engine
- active_request_put
- active_engine
- __igt_reset_engines
- igt_reset_engines
- fake_hangcheck
- igt_reset_wait
- evict_vma
- evict_fence
- __igt_reset_evict_vma
- igt_reset_evict_ggtt
- igt_reset_evict_ppgtt
- igt_reset_evict_fence
- wait_for_others
- igt_reset_queue
- igt_handle_error
- __igt_atomic_reset_engine
- igt_atomic_reset_engine
- igt_reset_engines_atomic
- intel_hangcheck_live_selftests
25 #include <linux/kthread.h>
26
27 #include "gem/i915_gem_context.h"
28 #include "gt/intel_gt.h"
29 #include "intel_engine_pm.h"
30
31 #include "i915_selftest.h"
32 #include "selftests/i915_random.h"
33 #include "selftests/igt_flush_test.h"
34 #include "selftests/igt_reset.h"
35 #include "selftests/igt_atomic.h"
36
37 #include "selftests/mock_drm.h"
38
39 #include "gem/selftests/mock_context.h"
40 #include "gem/selftests/igt_gem_utils.h"
41
42 #define IGT_IDLE_TIMEOUT 50
43
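/*
 * Common fixture for the hang tests: h->obj holds a batch that writes the
 * request's seqno into the HWS page (h->seqno) and then branches back to
 * its own start, spinning until it is reset or until the first dword is
 * rewritten to MI_BATCH_BUFFER_END.
 */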
44 struct hang {
45 struct intel_gt *gt;
46 struct drm_i915_gem_object *hws;
47 struct drm_i915_gem_object *obj;
48 struct i915_gem_context *ctx;
49 u32 *seqno;
50 u32 *batch;
51 };
52
53 static int hang_init(struct hang *h, struct intel_gt *gt)
54 {
55 void *vaddr;
56 int err;
57
58 memset(h, 0, sizeof(*h));
59 h->gt = gt;
60
61 h->ctx = kernel_context(gt->i915);
62 if (IS_ERR(h->ctx))
63 return PTR_ERR(h->ctx);
64
65 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
66
67 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
68 if (IS_ERR(h->hws)) {
69 err = PTR_ERR(h->hws);
70 goto err_ctx;
71 }
72
73 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
74 if (IS_ERR(h->obj)) {
75 err = PTR_ERR(h->obj);
76 goto err_hws;
77 }
78
79 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
80 vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
81 if (IS_ERR(vaddr)) {
82 err = PTR_ERR(vaddr);
83 goto err_obj;
84 }
85 h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
86
87 vaddr = i915_gem_object_pin_map(h->obj,
88 i915_coherent_map_type(gt->i915));
89 if (IS_ERR(vaddr)) {
90 err = PTR_ERR(vaddr);
91 goto err_unpin_hws;
92 }
93 h->batch = vaddr;
94
95 return 0;
96
97 err_unpin_hws:
98 i915_gem_object_unpin_map(h->hws);
99 err_obj:
100 i915_gem_object_put(h->obj);
101 err_hws:
102 i915_gem_object_put(h->hws);
103 err_ctx:
104 kernel_context_close(h->ctx);
105 return err;
106 }
107
108 static u64 hws_address(const struct i915_vma *hws,
109 const struct i915_request *rq)
110 {
111 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
112 }
113
114 static int move_to_active(struct i915_vma *vma,
115 struct i915_request *rq,
116 unsigned int flags)
117 {
118 int err;
119
120 i915_vma_lock(vma);
121 err = i915_request_await_object(rq, vma->obj,
122 flags & EXEC_OBJECT_WRITE);
123 if (err == 0)
124 err = i915_vma_move_to_active(vma, rq, flags);
125 i915_vma_unlock(vma);
126
127 return err;
128 }
129
130 static struct i915_request *
131 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
132 {
133 struct intel_gt *gt = h->gt;
134 struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
135 struct drm_i915_gem_object *obj;
136 struct i915_request *rq = NULL;
137 struct i915_vma *hws, *vma;
138 unsigned int flags;
139 void *vaddr;
140 u32 *batch;
141 int err;
142
143 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
144 if (IS_ERR(obj))
145 return ERR_CAST(obj);
146
147 vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
148 if (IS_ERR(vaddr)) {
149 i915_gem_object_put(obj);
150 return ERR_CAST(vaddr);
151 }
152
153 i915_gem_object_unpin_map(h->obj);
154 i915_gem_object_put(h->obj);
155
156 h->obj = obj;
157 h->batch = vaddr;
158
159 vma = i915_vma_instance(h->obj, vm, NULL);
160 if (IS_ERR(vma))
161 return ERR_CAST(vma);
162
163 hws = i915_vma_instance(h->hws, vm, NULL);
164 if (IS_ERR(hws))
165 return ERR_CAST(hws);
166
167 err = i915_vma_pin(vma, 0, 0, PIN_USER);
168 if (err)
169 return ERR_PTR(err);
170
171 err = i915_vma_pin(hws, 0, 0, PIN_USER);
172 if (err)
173 goto unpin_vma;
174
175 rq = igt_request_alloc(h->ctx, engine);
176 if (IS_ERR(rq)) {
177 err = PTR_ERR(rq);
178 goto unpin_hws;
179 }
180
181 err = move_to_active(vma, rq, 0);
182 if (err)
183 goto cancel_rq;
184
185 err = move_to_active(hws, rq, 0);
186 if (err)
187 goto cancel_rq;
188
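/*
 * Emit the spinning batch: store the seqno into the per-context HWS slot,
 * leave an arbitration point, then branch back to the start of the batch
 * so that it runs until the engine is reset (or until the caller rewrites
 * *h->batch to MI_BATCH_BUFFER_END).
 */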
189 batch = h->batch;
190 if (INTEL_GEN(gt->i915) >= 8) {
191 *batch++ = MI_STORE_DWORD_IMM_GEN4;
192 *batch++ = lower_32_bits(hws_address(hws, rq));
193 *batch++ = upper_32_bits(hws_address(hws, rq));
194 *batch++ = rq->fence.seqno;
195 *batch++ = MI_ARB_CHECK;
196
197 memset(batch, 0, 1024);
198 batch += 1024 / sizeof(*batch);
199
200 *batch++ = MI_ARB_CHECK;
201 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
202 *batch++ = lower_32_bits(vma->node.start);
203 *batch++ = upper_32_bits(vma->node.start);
204 } else if (INTEL_GEN(gt->i915) >= 6) {
205 *batch++ = MI_STORE_DWORD_IMM_GEN4;
206 *batch++ = 0;
207 *batch++ = lower_32_bits(hws_address(hws, rq));
208 *batch++ = rq->fence.seqno;
209 *batch++ = MI_ARB_CHECK;
210
211 memset(batch, 0, 1024);
212 batch += 1024 / sizeof(*batch);
213
214 *batch++ = MI_ARB_CHECK;
215 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
216 *batch++ = lower_32_bits(vma->node.start);
217 } else if (INTEL_GEN(gt->i915) >= 4) {
218 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
219 *batch++ = 0;
220 *batch++ = lower_32_bits(hws_address(hws, rq));
221 *batch++ = rq->fence.seqno;
222 *batch++ = MI_ARB_CHECK;
223
224 memset(batch, 0, 1024);
225 batch += 1024 / sizeof(*batch);
226
227 *batch++ = MI_ARB_CHECK;
228 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
229 *batch++ = lower_32_bits(vma->node.start);
230 } else {
231 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
232 *batch++ = lower_32_bits(hws_address(hws, rq));
233 *batch++ = rq->fence.seqno;
234 *batch++ = MI_ARB_CHECK;
235
236 memset(batch, 0, 1024);
237 batch += 1024 / sizeof(*batch);
238
239 *batch++ = MI_ARB_CHECK;
240 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
241 *batch++ = lower_32_bits(vma->node.start);
242 }
243 *batch++ = MI_BATCH_BUFFER_END;
244 intel_gt_chipset_flush(engine->gt);
245
246 if (rq->engine->emit_init_breadcrumb) {
247 err = rq->engine->emit_init_breadcrumb(rq);
248 if (err)
249 goto cancel_rq;
250 }
251
252 flags = 0;
253 if (INTEL_GEN(gt->i915) <= 5)
254 flags |= I915_DISPATCH_SECURE;
255
256 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
257
258 cancel_rq:
259 if (err) {
260 i915_request_skip(rq, err);
261 i915_request_add(rq);
262 }
263 unpin_hws:
264 i915_vma_unpin(hws);
265 unpin_vma:
266 i915_vma_unpin(vma);
267 return err ? ERR_PTR(err) : rq;
268 }
269
270 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
271 {
272 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
273 }
274
275 static void hang_fini(struct hang *h)
276 {
277 *h->batch = MI_BATCH_BUFFER_END;
278 intel_gt_chipset_flush(h->gt);
279
280 i915_gem_object_unpin_map(h->obj);
281 i915_gem_object_put(h->obj);
282
283 i915_gem_object_unpin_map(h->hws);
284 i915_gem_object_put(h->hws);
285
286 kernel_context_close(h->ctx);
287
288 igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
289 }
290
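/*
 * Poll the HWS for the spinner's seqno: a short busy-wait followed by a
 * longer sleeping wait, to confirm that the hanging batch has actually
 * started executing.
 */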
291 static bool wait_until_running(struct hang *h, struct i915_request *rq)
292 {
293 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
294 rq->fence.seqno),
295 10) &&
296 wait_for(i915_seqno_passed(hws_seqno(h, rq),
297 rq->fence.seqno),
298 1000));
299 }
300
301 static int igt_hang_sanitycheck(void *arg)
302 {
303 struct intel_gt *gt = arg;
304 struct i915_request *rq;
305 struct intel_engine_cs *engine;
306 enum intel_engine_id id;
307 struct hang h;
308 int err;
309
310
311
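/*
 * Basic check that we can submit the (would-be) hanging batch on each
 * engine, terminate it with MI_BATCH_BUFFER_END and see it complete.
 */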
312 mutex_lock(&gt->i915->drm.struct_mutex);
313 err = hang_init(&h, gt);
314 if (err)
315 goto unlock;
316
317 for_each_engine(engine, gt->i915, id) {
318 struct intel_wedge_me w;
319 long timeout;
320
321 if (!intel_engine_can_store_dword(engine))
322 continue;
323
324 rq = hang_create_request(&h, engine);
325 if (IS_ERR(rq)) {
326 err = PTR_ERR(rq);
327 pr_err("Failed to create request for %s, err=%d\n",
328 engine->name, err);
329 goto fini;
330 }
331
332 i915_request_get(rq);
333
334 *h.batch = MI_BATCH_BUFFER_END;
335 intel_gt_chipset_flush(engine->gt);
336
337 i915_request_add(rq);
338
339 timeout = 0;
340 intel_wedge_on_timeout(&w, gt, HZ / 10)
341 timeout = i915_request_wait(rq, 0,
342 MAX_SCHEDULE_TIMEOUT);
343 if (intel_gt_is_wedged(gt))
344 timeout = -EIO;
345
346 i915_request_put(rq);
347
348 if (timeout < 0) {
349 err = timeout;
350 pr_err("Wait for request failed on %s, err=%d\n",
351 engine->name, err);
352 goto fini;
353 }
354 }
355
356 fini:
357 hang_fini(&h);
358 unlock:
359 mutex_unlock(&gt->i915->drm.struct_mutex);
360 return err;
361 }
362
363 static bool wait_for_idle(struct intel_engine_cs *engine)
364 {
365 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
366 }
367
368 static int igt_reset_nop(void *arg)
369 {
370 struct intel_gt *gt = arg;
371 struct i915_gpu_error *global = &gt->i915->gpu_error;
372 struct intel_engine_cs *engine;
373 struct i915_gem_context *ctx;
374 unsigned int reset_count, count;
375 enum intel_engine_id id;
376 struct drm_file *file;
377 IGT_TIMEOUT(end_time);
378 int err = 0;
379
380
381
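/*
 * Check that a full GPU reset can be applied repeatedly while only
 * trivial, non-hanging requests are in flight, and that each reset is
 * recorded in the global reset count.
 */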
382 file = mock_file(gt->i915);
383 if (IS_ERR(file))
384 return PTR_ERR(file);
385
386 mutex_lock(&gt->i915->drm.struct_mutex);
387 ctx = live_context(gt->i915, file);
388 mutex_unlock(&gt->i915->drm.struct_mutex);
389 if (IS_ERR(ctx)) {
390 err = PTR_ERR(ctx);
391 goto out;
392 }
393
394 i915_gem_context_clear_bannable(ctx);
395 reset_count = i915_reset_count(global);
396 count = 0;
397 do {
398 mutex_lock(&gt->i915->drm.struct_mutex);
399
400 for_each_engine(engine, gt->i915, id) {
401 int i;
402
403 for (i = 0; i < 16; i++) {
404 struct i915_request *rq;
405
406 rq = igt_request_alloc(ctx, engine);
407 if (IS_ERR(rq)) {
408 err = PTR_ERR(rq);
409 break;
410 }
411
412 i915_request_add(rq);
413 }
414 }
415
416 igt_global_reset_lock(gt);
417 intel_gt_reset(gt, ALL_ENGINES, NULL);
418 igt_global_reset_unlock(gt);
419
420 mutex_unlock(&gt->i915->drm.struct_mutex);
421 if (intel_gt_is_wedged(gt)) {
422 err = -EIO;
423 break;
424 }
425
426 if (i915_reset_count(global) != reset_count + ++count) {
427 pr_err("Full GPU reset not recorded!\n");
428 err = -EINVAL;
429 break;
430 }
431
432 err = igt_flush_test(gt->i915, 0);
433 if (err)
434 break;
435 } while (time_before(jiffies, end_time));
436 pr_info("%s: %d resets\n", __func__, count);
437
438 mutex_lock(&gt->i915->drm.struct_mutex);
439 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
440 mutex_unlock(&gt->i915->drm.struct_mutex);
441
442 out:
443 mock_file_free(gt->i915, file);
444 if (intel_gt_is_wedged(gt))
445 err = -EIO;
446 return err;
447 }
448
449 static int igt_reset_nop_engine(void *arg)
450 {
451 struct intel_gt *gt = arg;
452 struct i915_gpu_error *global = &gt->i915->gpu_error;
453 struct intel_engine_cs *engine;
454 struct i915_gem_context *ctx;
455 enum intel_engine_id id;
456 struct drm_file *file;
457 int err = 0;
458
459
460
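/*
 * As above, but with per-engine resets: reset each engine while it is
 * busy with trivial requests and check that only the engine reset count,
 * not the full GPU reset count, advances.
 */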
461 if (!intel_has_reset_engine(gt->i915))
462 return 0;
463
464 file = mock_file(gt->i915);
465 if (IS_ERR(file))
466 return PTR_ERR(file);
467
468 mutex_lock(&gt->i915->drm.struct_mutex);
469 ctx = live_context(gt->i915, file);
470 mutex_unlock(&gt->i915->drm.struct_mutex);
471 if (IS_ERR(ctx)) {
472 err = PTR_ERR(ctx);
473 goto out;
474 }
475
476 i915_gem_context_clear_bannable(ctx);
477 for_each_engine(engine, gt->i915, id) {
478 unsigned int reset_count, reset_engine_count;
479 unsigned int count;
480 IGT_TIMEOUT(end_time);
481
482 reset_count = i915_reset_count(global);
483 reset_engine_count = i915_reset_engine_count(global, engine);
484 count = 0;
485
486 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
487 do {
488 int i;
489
490 if (!wait_for_idle(engine)) {
491 pr_err("%s failed to idle before reset\n",
492 engine->name);
493 err = -EIO;
494 break;
495 }
496
497 mutex_lock(&gt->i915->drm.struct_mutex);
498 for (i = 0; i < 16; i++) {
499 struct i915_request *rq;
500
501 rq = igt_request_alloc(ctx, engine);
502 if (IS_ERR(rq)) {
503 err = PTR_ERR(rq);
504 break;
505 }
506
507 i915_request_add(rq);
508 }
509 err = intel_engine_reset(engine, NULL);
510 mutex_unlock(&gt->i915->drm.struct_mutex);
511 if (err) {
512 pr_err("i915_reset_engine failed\n");
513 break;
514 }
515
516 if (i915_reset_count(global) != reset_count) {
517 pr_err("Full GPU reset recorded! (engine reset expected)\n");
518 err = -EINVAL;
519 break;
520 }
521
522 if (i915_reset_engine_count(global, engine) !=
523 reset_engine_count + ++count) {
524 pr_err("%s engine reset not recorded!\n",
525 engine->name);
526 err = -EINVAL;
527 break;
528 }
529 } while (time_before(jiffies, end_time));
530 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
531 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
532
533 if (err)
534 break;
535
536 err = igt_flush_test(gt->i915, 0);
537 if (err)
538 break;
539 }
540
541 mutex_lock(&gt->i915->drm.struct_mutex);
542 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
543 mutex_unlock(&gt->i915->drm.struct_mutex);
544
545 out:
546 mock_file_free(gt->i915, file);
547 if (intel_gt_is_wedged(gt))
548 err = -EIO;
549 return err;
550 }
551
552 static int __igt_reset_engine(struct intel_gt *gt, bool active)
553 {
554 struct i915_gpu_error *global = &gt->i915->gpu_error;
555 struct intel_engine_cs *engine;
556 enum intel_engine_id id;
557 struct hang h;
558 int err = 0;
559
560
561
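/*
 * Check that we can reset an individual engine, either while it is idle
 * or while it is running a spinning batch, and that the reset is
 * accounted to the engine rather than to the whole device.
 */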
562 if (!intel_has_reset_engine(gt->i915))
563 return 0;
564
565 if (active) {
566 mutex_lock(&gt->i915->drm.struct_mutex);
567 err = hang_init(&h, gt);
568 mutex_unlock(&gt->i915->drm.struct_mutex);
569 if (err)
570 return err;
571 }
572
573 for_each_engine(engine, gt->i915, id) {
574 unsigned int reset_count, reset_engine_count;
575 IGT_TIMEOUT(end_time);
576
577 if (active && !intel_engine_can_store_dword(engine))
578 continue;
579
580 if (!wait_for_idle(engine)) {
581 pr_err("%s failed to idle before reset\n",
582 engine->name);
583 err = -EIO;
584 break;
585 }
586
587 reset_count = i915_reset_count(global);
588 reset_engine_count = i915_reset_engine_count(global, engine);
589
590 intel_engine_pm_get(engine);
591 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
592 do {
593 if (active) {
594 struct i915_request *rq;
595
596 mutex_lock(&gt->i915->drm.struct_mutex);
597 rq = hang_create_request(&h, engine);
598 if (IS_ERR(rq)) {
599 err = PTR_ERR(rq);
600 mutex_unlock(&gt->i915->drm.struct_mutex);
601 break;
602 }
603
604 i915_request_get(rq);
605 i915_request_add(rq);
606 mutex_unlock(&gt->i915->drm.struct_mutex);
607
608 if (!wait_until_running(&h, rq)) {
609 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
610
611 pr_err("%s: Failed to start request %llx, at %x\n",
612 __func__, rq->fence.seqno, hws_seqno(&h, rq));
613 intel_engine_dump(engine, &p,
614 "%s\n", engine->name);
615
616 i915_request_put(rq);
617 err = -EIO;
618 break;
619 }
620
621 i915_request_put(rq);
622 }
623
624 err = intel_engine_reset(engine, NULL);
625 if (err) {
626 pr_err("i915_reset_engine failed\n");
627 break;
628 }
629
630 if (i915_reset_count(global) != reset_count) {
631 pr_err("Full GPU reset recorded! (engine reset expected)\n");
632 err = -EINVAL;
633 break;
634 }
635
636 if (i915_reset_engine_count(global, engine) !=
637 ++reset_engine_count) {
638 pr_err("%s engine reset not recorded!\n",
639 engine->name);
640 err = -EINVAL;
641 break;
642 }
643 } while (time_before(jiffies, end_time));
644 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
645 intel_engine_pm_put(engine);
646
647 if (err)
648 break;
649
650 err = igt_flush_test(gt->i915, 0);
651 if (err)
652 break;
653 }
654
655 if (intel_gt_is_wedged(gt))
656 err = -EIO;
657
658 if (active) {
659 mutex_lock(&gt->i915->drm.struct_mutex);
660 hang_fini(&h);
661 mutex_unlock(&gt->i915->drm.struct_mutex);
662 }
663
664 return err;
665 }
666
667 static int igt_reset_idle_engine(void *arg)
668 {
669 return __igt_reset_engine(arg, false);
670 }
671
672 static int igt_reset_active_engine(void *arg)
673 {
674 return __igt_reset_engine(arg, true);
675 }
676
677 struct active_engine {
678 struct task_struct *task;
679 struct intel_engine_cs *engine;
680 unsigned long resets;
681 unsigned int flags;
682 };
683
684 #define TEST_ACTIVE BIT(0)
685 #define TEST_OTHERS BIT(1)
686 #define TEST_SELF BIT(2)
687 #define TEST_PRIORITY BIT(3)
688
689 static int active_request_put(struct i915_request *rq)
690 {
691 int err = 0;
692
693 if (!rq)
694 return 0;
695
696 if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
697 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
698 rq->engine->name,
699 rq->fence.context,
700 rq->fence.seqno);
701 GEM_TRACE_DUMP();
702
703 intel_gt_set_wedged(rq->engine->gt);
704 err = -EIO;
705 }
706
707 i915_request_put(rq);
708
709 return err;
710 }
711
712 static int active_engine(void *data)
713 {
714 I915_RND_STATE(prng);
715 struct active_engine *arg = data;
716 struct intel_engine_cs *engine = arg->engine;
717 struct i915_request *rq[8] = {};
718 struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
719 struct drm_file *file;
720 unsigned long count = 0;
721 int err = 0;
722
723 file = mock_file(engine->i915);
724 if (IS_ERR(file))
725 return PTR_ERR(file);
726
727 for (count = 0; count < ARRAY_SIZE(ctx); count++) {
728 mutex_lock(&engine->i915->drm.struct_mutex);
729 ctx[count] = live_context(engine->i915, file);
730 mutex_unlock(&engine->i915->drm.struct_mutex);
731 if (IS_ERR(ctx[count])) {
732 err = PTR_ERR(ctx[count]);
733 while (--count)
734 i915_gem_context_put(ctx[count]);
735 goto err_file;
736 }
737 }
738
739 while (!kthread_should_stop()) {
740 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
741 struct i915_request *old = rq[idx];
742 struct i915_request *new;
743
744 mutex_lock(&engine->i915->drm.struct_mutex);
745 new = igt_request_alloc(ctx[idx], engine);
746 if (IS_ERR(new)) {
747 mutex_unlock(&engine->i915->drm.struct_mutex);
748 err = PTR_ERR(new);
749 break;
750 }
751
752 if (arg->flags & TEST_PRIORITY)
753 ctx[idx]->sched.priority =
754 i915_prandom_u32_max_state(512, &prng);
755
756 rq[idx] = i915_request_get(new);
757 i915_request_add(new);
758 mutex_unlock(&engine->i915->drm.struct_mutex);
759
760 err = active_request_put(old);
761 if (err)
762 break;
763
764 cond_resched();
765 }
766
767 for (count = 0; count < ARRAY_SIZE(rq); count++) {
768 int err__ = active_request_put(rq[count]);
769
770
771 if (!err)
772 err = err__;
773 }
774
775 err_file:
776 mock_file_free(engine->i915, file);
777 return err;
778 }
779
780 static int __igt_reset_engines(struct intel_gt *gt,
781 const char *test_name,
782 unsigned int flags)
783 {
784 struct i915_gpu_error *global = &gt->i915->gpu_error;
785 struct intel_engine_cs *engine, *other;
786 enum intel_engine_id id, tmp;
787 struct hang h;
788 int err = 0;
789
790
791
792
793
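/*
 * Check that resetting one engine does not disturb the others: keep the
 * other engines busy from background kthreads while repeatedly resetting
 * the target, then verify that no innocent engine was reset and that no
 * full GPU reset was recorded.
 */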
794 if (!intel_has_reset_engine(gt->i915))
795 return 0;
796
797 if (flags & TEST_ACTIVE) {
798 mutex_lock(&gt->i915->drm.struct_mutex);
799 err = hang_init(&h, gt);
800 mutex_unlock(&gt->i915->drm.struct_mutex);
801 if (err)
802 return err;
803
804 if (flags & TEST_PRIORITY)
805 h.ctx->sched.priority = 1024;
806 }
807
808 for_each_engine(engine, gt->i915, id) {
809 struct active_engine threads[I915_NUM_ENGINES] = {};
810 unsigned long device = i915_reset_count(global);
811 unsigned long count = 0, reported;
812 IGT_TIMEOUT(end_time);
813
814 if (flags & TEST_ACTIVE &&
815 !intel_engine_can_store_dword(engine))
816 continue;
817
818 if (!wait_for_idle(engine)) {
819 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
820 engine->name, test_name);
821 err = -EIO;
822 break;
823 }
824
825 memset(threads, 0, sizeof(threads));
826 for_each_engine(other, gt->i915, tmp) {
827 struct task_struct *tsk;
828
829 threads[tmp].resets =
830 i915_reset_engine_count(global, other);
831
832 if (!(flags & TEST_OTHERS))
833 continue;
834
835 if (other == engine && !(flags & TEST_SELF))
836 continue;
837
838 threads[tmp].engine = other;
839 threads[tmp].flags = flags;
840
841 tsk = kthread_run(active_engine, &threads[tmp],
842 "igt/%s", other->name);
843 if (IS_ERR(tsk)) {
844 err = PTR_ERR(tsk);
845 goto unwind;
846 }
847
848 threads[tmp].task = tsk;
849 get_task_struct(tsk);
850 }
851
852 intel_engine_pm_get(engine);
853 set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
854 do {
855 struct i915_request *rq = NULL;
856
857 if (flags & TEST_ACTIVE) {
858 mutex_lock(&gt->i915->drm.struct_mutex);
859 rq = hang_create_request(&h, engine);
860 if (IS_ERR(rq)) {
861 err = PTR_ERR(rq);
862 mutex_unlock(&gt->i915->drm.struct_mutex);
863 break;
864 }
865
866 i915_request_get(rq);
867 i915_request_add(rq);
868 mutex_unlock(&gt->i915->drm.struct_mutex);
869
870 if (!wait_until_running(&h, rq)) {
871 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
872
873 pr_err("%s: Failed to start request %llx, at %x\n",
874 __func__, rq->fence.seqno, hws_seqno(&h, rq));
875 intel_engine_dump(engine, &p,
876 "%s\n", engine->name);
877
878 i915_request_put(rq);
879 err = -EIO;
880 break;
881 }
882 }
883
884 err = intel_engine_reset(engine, NULL);
885 if (err) {
886 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
887 engine->name, test_name, err);
888 break;
889 }
890
891 count++;
892
893 if (rq) {
894 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
895 struct drm_printer p =
896 drm_info_printer(gt->i915->drm.dev);
897
898 pr_err("i915_reset_engine(%s:%s):"
899 " failed to complete request after reset\n",
900 engine->name, test_name);
901 intel_engine_dump(engine, &p,
902 "%s\n", engine->name);
903 i915_request_put(rq);
904
905 GEM_TRACE_DUMP();
906 intel_gt_set_wedged(gt);
907 err = -EIO;
908 break;
909 }
910
911 i915_request_put(rq);
912 }
913
914 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
915 struct drm_printer p =
916 drm_info_printer(gt->i915->drm.dev);
917
918 pr_err("i915_reset_engine(%s:%s):"
919 " failed to idle after reset\n",
920 engine->name, test_name);
921 intel_engine_dump(engine, &p,
922 "%s\n", engine->name);
923
924 err = -EIO;
925 break;
926 }
927 } while (time_before(jiffies, end_time));
928 clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
929 intel_engine_pm_put(engine);
930 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
931 engine->name, test_name, count);
932
933 reported = i915_reset_engine_count(global, engine);
934 reported -= threads[engine->id].resets;
935 if (reported != count) {
936 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
937 engine->name, test_name, count, reported);
938 if (!err)
939 err = -EINVAL;
940 }
941
942 unwind:
943 for_each_engine(other, gt->i915, tmp) {
944 int ret;
945
946 if (!threads[tmp].task)
947 continue;
948
949 ret = kthread_stop(threads[tmp].task);
950 if (ret) {
951 pr_err("kthread for other engine %s failed, err=%d\n",
952 other->name, ret);
953 if (!err)
954 err = ret;
955 }
956 put_task_struct(threads[tmp].task);
957
958 if (other->uabi_class != engine->uabi_class &&
959 threads[tmp].resets !=
960 i915_reset_engine_count(global, other)) {
961 pr_err("Innocent engine %s was reset (count=%ld)\n",
962 other->name,
963 i915_reset_engine_count(global, other) -
964 threads[tmp].resets);
965 if (!err)
966 err = -EINVAL;
967 }
968 }
969
970 if (device != i915_reset_count(global)) {
971 pr_err("Global reset (count=%ld)!\n",
972 i915_reset_count(global) - device);
973 if (!err)
974 err = -EINVAL;
975 }
976
977 if (err)
978 break;
979
980 mutex_lock(&gt->i915->drm.struct_mutex);
981 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
982 mutex_unlock(&gt->i915->drm.struct_mutex);
983 if (err)
984 break;
985 }
986
987 if (intel_gt_is_wedged(gt))
988 err = -EIO;
989
990 if (flags & TEST_ACTIVE) {
991 mutex_lock(&gt->i915->drm.struct_mutex);
992 hang_fini(&h);
993 mutex_unlock(&gt->i915->drm.struct_mutex);
994 }
995
996 return err;
997 }
998
999 static int igt_reset_engines(void *arg)
1000 {
1001 static const struct {
1002 const char *name;
1003 unsigned int flags;
1004 } phases[] = {
1005 { "idle", 0 },
1006 { "active", TEST_ACTIVE },
1007 { "others-idle", TEST_OTHERS },
1008 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1009 {
1010 "others-priority",
1011 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1012 },
1013 {
1014 "self-priority",
1015 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1016 },
1017 { }
1018 };
1019 struct intel_gt *gt = arg;
1020 typeof(*phases) *p;
1021 int err;
1022
1023 for (p = phases; p->name; p++) {
1024 if (p->flags & TEST_PRIORITY) {
1025 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1026 continue;
1027 }
1028
1029 err = __igt_reset_engines(arg, p->name, p->flags);
1030 if (err)
1031 return err;
1032 }
1033
1034 return 0;
1035 }
1036
1037 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1038 {
1039 u32 count = i915_reset_count(&gt->i915->gpu_error);
1040
1041 intel_gt_reset(gt, mask, NULL);
1042
1043 return count;
1044 }
1045
1046 static int igt_reset_wait(void *arg)
1047 {
1048 struct intel_gt *gt = arg;
1049 struct i915_gpu_error *global = &gt->i915->gpu_error;
1050 struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1051 struct i915_request *rq;
1052 unsigned int reset_count;
1053 struct hang h;
1054 long timeout;
1055 int err;
1056
1057 if (!engine || !intel_engine_can_store_dword(engine))
1058 return 0;
1059
1060
1061
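/*
 * Check that a waiter blocked on a hung request is released once the
 * reset completes that request.
 */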
1062 igt_global_reset_lock(gt);
1063
1064 mutex_lock(&gt->i915->drm.struct_mutex);
1065 err = hang_init(&h, gt);
1066 if (err)
1067 goto unlock;
1068
1069 rq = hang_create_request(&h, engine);
1070 if (IS_ERR(rq)) {
1071 err = PTR_ERR(rq);
1072 goto fini;
1073 }
1074
1075 i915_request_get(rq);
1076 i915_request_add(rq);
1077
1078 if (!wait_until_running(&h, rq)) {
1079 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1080
1081 pr_err("%s: Failed to start request %llx, at %x\n",
1082 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1083 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1084
1085 intel_gt_set_wedged(gt);
1086
1087 err = -EIO;
1088 goto out_rq;
1089 }
1090
1091 reset_count = fake_hangcheck(gt, ALL_ENGINES);
1092
1093 timeout = i915_request_wait(rq, 0, 10);
1094 if (timeout < 0) {
1095 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1096 timeout);
1097 err = timeout;
1098 goto out_rq;
1099 }
1100
1101 if (i915_reset_count(global) == reset_count) {
1102 pr_err("No GPU reset recorded!\n");
1103 err = -EINVAL;
1104 goto out_rq;
1105 }
1106
1107 out_rq:
1108 i915_request_put(rq);
1109 fini:
1110 hang_fini(&h);
1111 unlock:
1112 mutex_unlock(&gt->i915->drm.struct_mutex);
1113 igt_global_reset_unlock(gt);
1114
1115 if (intel_gt_is_wedged(gt))
1116 return -EIO;
1117
1118 return err;
1119 }
1120
1121 struct evict_vma {
1122 struct completion completion;
1123 struct i915_vma *vma;
1124 };
1125
1126 static int evict_vma(void *data)
1127 {
1128 struct evict_vma *arg = data;
1129 struct i915_address_space *vm = arg->vma->vm;
1130 struct drm_i915_private *i915 = vm->i915;
1131 struct drm_mm_node evict = arg->vma->node;
1132 int err;
1133
1134 complete(&arg->completion);
1135
1136 mutex_lock(&i915->drm.struct_mutex);
1137 err = i915_gem_evict_for_node(vm, &evict, 0);
1138 mutex_unlock(&i915->drm.struct_mutex);
1139
1140 return err;
1141 }
1142
1143 static int evict_fence(void *data)
1144 {
1145 struct evict_vma *arg = data;
1146 struct drm_i915_private *i915 = arg->vma->vm->i915;
1147 int err;
1148
1149 complete(&arg->completion);
1150
1151 mutex_lock(&i915->drm.struct_mutex);
1152
1153
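/*
 * Switch the vma to Y-tiling and try to acquire a fence for it while the
 * GPU is still hung on the request that is using it.
 */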
1154 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1155 if (err) {
1156 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1157 goto out_unlock;
1158 }
1159
1160 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1161 if (err) {
1162 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1163 goto out_unlock;
1164 }
1165
1166 err = i915_vma_pin_fence(arg->vma);
1167 i915_vma_unpin(arg->vma);
1168 if (err) {
1169 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1170 goto out_unlock;
1171 }
1172
1173 i915_vma_unpin_fence(arg->vma);
1174
1175 out_unlock:
1176 mutex_unlock(&i915->drm.struct_mutex);
1177
1178 return err;
1179 }
1180
1181 static int __igt_reset_evict_vma(struct intel_gt *gt,
1182 struct i915_address_space *vm,
1183 int (*fn)(void *),
1184 unsigned int flags)
1185 {
1186 struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1187 struct drm_i915_gem_object *obj;
1188 struct task_struct *tsk = NULL;
1189 struct i915_request *rq;
1190 struct evict_vma arg;
1191 struct hang h;
1192 int err;
1193
1194 if (!engine || !intel_engine_can_store_dword(engine))
1195 return 0;
1196
1197
1198
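/*
 * Check that an eviction (or fence update) stuck behind a hanging request
 * is released by the reset: the background thread blocks on the busy vma
 * and must complete once fake_hangcheck() fires.
 */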
1199 mutex_lock(&gt->i915->drm.struct_mutex);
1200 err = hang_init(&h, gt);
1201 if (err)
1202 goto unlock;
1203
1204 obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1205 if (IS_ERR(obj)) {
1206 err = PTR_ERR(obj);
1207 goto fini;
1208 }
1209
1210 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1211 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1212 if (err) {
1213 pr_err("Invalid X-tiling settings; err:%d\n", err);
1214 goto out_obj;
1215 }
1216 }
1217
1218 arg.vma = i915_vma_instance(obj, vm, NULL);
1219 if (IS_ERR(arg.vma)) {
1220 err = PTR_ERR(arg.vma);
1221 goto out_obj;
1222 }
1223
1224 rq = hang_create_request(&h, engine);
1225 if (IS_ERR(rq)) {
1226 err = PTR_ERR(rq);
1227 goto out_obj;
1228 }
1229
1230 err = i915_vma_pin(arg.vma, 0, 0,
1231 i915_vma_is_ggtt(arg.vma) ?
1232 PIN_GLOBAL | PIN_MAPPABLE :
1233 PIN_USER);
1234 if (err) {
1235 i915_request_add(rq);
1236 goto out_obj;
1237 }
1238
1239 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1240 err = i915_vma_pin_fence(arg.vma);
1241 if (err) {
1242 pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1243 i915_vma_unpin(arg.vma);
1244 i915_request_add(rq);
1245 goto out_obj;
1246 }
1247 }
1248
1249 i915_vma_lock(arg.vma);
1250 err = i915_request_await_object(rq, arg.vma->obj,
1251 flags & EXEC_OBJECT_WRITE);
1252 if (err == 0)
1253 err = i915_vma_move_to_active(arg.vma, rq, flags);
1254 i915_vma_unlock(arg.vma);
1255
1256 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1257 i915_vma_unpin_fence(arg.vma);
1258 i915_vma_unpin(arg.vma);
1259
1260 i915_request_get(rq);
1261 i915_request_add(rq);
1262 if (err)
1263 goto out_rq;
1264
1265 mutex_unlock(&gt->i915->drm.struct_mutex);
1266
1267 if (!wait_until_running(&h, rq)) {
1268 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1269
1270 pr_err("%s: Failed to start request %llx, at %x\n",
1271 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1272 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1273
1274 intel_gt_set_wedged(gt);
1275 goto out_reset;
1276 }
1277
1278 init_completion(&arg.completion);
1279
1280 tsk = kthread_run(fn, &arg, "igt/evict_vma");
1281 if (IS_ERR(tsk)) {
1282 err = PTR_ERR(tsk);
1283 tsk = NULL;
1284 goto out_reset;
1285 }
1286 get_task_struct(tsk);
1287
1288 wait_for_completion(&arg.completion);
1289
1290 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1291 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1292
1293 pr_err("igt/evict_vma kthread did not wait\n");
1294 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1295
1296 intel_gt_set_wedged(gt);
1297 goto out_reset;
1298 }
1299
1300 out_reset:
1301 igt_global_reset_lock(gt);
1302 fake_hangcheck(gt, rq->engine->mask);
1303 igt_global_reset_unlock(gt);
1304
1305 if (tsk) {
1306 struct intel_wedge_me w;
1307
1308
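/* The reset above should have unblocked the kthread; if it fails to stop
 * within the timeout, declare the GT wedged.
 */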
1309 intel_wedge_on_timeout(&w, gt, HZ / 10)
1310 err = kthread_stop(tsk);
1311
1312 put_task_struct(tsk);
1313 }
1314
1315 mutex_lock(&gt->i915->drm.struct_mutex);
1316 out_rq:
1317 i915_request_put(rq);
1318 out_obj:
1319 i915_gem_object_put(obj);
1320 fini:
1321 hang_fini(&h);
1322 unlock:
1323 mutex_unlock(&gt->i915->drm.struct_mutex);
1324
1325 if (intel_gt_is_wedged(gt))
1326 return -EIO;
1327
1328 return err;
1329 }
1330
1331 static int igt_reset_evict_ggtt(void *arg)
1332 {
1333 struct intel_gt *gt = arg;
1334
1335 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1336 evict_vma, EXEC_OBJECT_WRITE);
1337 }
1338
1339 static int igt_reset_evict_ppgtt(void *arg)
1340 {
1341 struct intel_gt *gt = arg;
1342 struct i915_gem_context *ctx;
1343 struct drm_file *file;
1344 int err;
1345
1346 file = mock_file(gt->i915);
1347 if (IS_ERR(file))
1348 return PTR_ERR(file);
1349
1350 mutex_lock(&gt->i915->drm.struct_mutex);
1351 ctx = live_context(gt->i915, file);
1352 mutex_unlock(&gt->i915->drm.struct_mutex);
1353 if (IS_ERR(ctx)) {
1354 err = PTR_ERR(ctx);
1355 goto out;
1356 }
1357
1358 err = 0;
1359 if (ctx->vm)
1360 err = __igt_reset_evict_vma(gt, ctx->vm,
1361 evict_vma, EXEC_OBJECT_WRITE);
1362
1363 out:
1364 mock_file_free(gt->i915, file);
1365 return err;
1366 }
1367
1368 static int igt_reset_evict_fence(void *arg)
1369 {
1370 struct intel_gt *gt = arg;
1371
1372 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1373 evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1374 }
1375
1376 static int wait_for_others(struct intel_gt *gt,
1377 struct intel_engine_cs *exclude)
1378 {
1379 struct intel_engine_cs *engine;
1380 enum intel_engine_id id;
1381
1382 for_each_engine(engine, gt->i915, id) {
1383 if (engine == exclude)
1384 continue;
1385
1386 if (!wait_for_idle(engine))
1387 return -EIO;
1388 }
1389
1390 return 0;
1391 }
1392
1393 static int igt_reset_queue(void *arg)
1394 {
1395 struct intel_gt *gt = arg;
1396 struct i915_gpu_error *global = &gt->i915->gpu_error;
1397 struct intel_engine_cs *engine;
1398 enum intel_engine_id id;
1399 struct hang h;
1400 int err;
1401
1402
1403
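/*
 * Check the handling of a queue of hanging requests: reset between each
 * pair and verify that the guilty request is marked with -EIO while its
 * queued successor is left untouched.
 */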
1404 igt_global_reset_lock(gt);
1405
1406 mutex_lock(&gt->i915->drm.struct_mutex);
1407 err = hang_init(&h, gt);
1408 if (err)
1409 goto unlock;
1410
1411 for_each_engine(engine, gt->i915, id) {
1412 struct i915_request *prev;
1413 IGT_TIMEOUT(end_time);
1414 unsigned int count;
1415
1416 if (!intel_engine_can_store_dword(engine))
1417 continue;
1418
1419 prev = hang_create_request(&h, engine);
1420 if (IS_ERR(prev)) {
1421 err = PTR_ERR(prev);
1422 goto fini;
1423 }
1424
1425 i915_request_get(prev);
1426 i915_request_add(prev);
1427
1428 count = 0;
1429 do {
1430 struct i915_request *rq;
1431 unsigned int reset_count;
1432
1433 rq = hang_create_request(&h, engine);
1434 if (IS_ERR(rq)) {
1435 err = PTR_ERR(rq);
1436 goto fini;
1437 }
1438
1439 i915_request_get(rq);
1440 i915_request_add(rq);
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
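/*
 * Let the other engines go idle before forcing another device reset;
 * resetting again while they still have work in flight is a hazard this
 * selftest deliberately avoids.
 */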
1452 err = wait_for_others(gt, engine);
1453 if (err) {
1454 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1455 __func__, engine->name);
1456 i915_request_put(rq);
1457 i915_request_put(prev);
1458
1459 GEM_TRACE_DUMP();
1460 intel_gt_set_wedged(gt);
1461 goto fini;
1462 }
1463
1464 if (!wait_until_running(&h, prev)) {
1465 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1466
1467 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1468 __func__, engine->name,
1469 prev->fence.seqno, hws_seqno(&h, prev));
1470 intel_engine_dump(engine, &p,
1471 "%s\n", engine->name);
1472
1473 i915_request_put(rq);
1474 i915_request_put(prev);
1475
1476 intel_gt_set_wedged(gt);
1477
1478 err = -EIO;
1479 goto fini;
1480 }
1481
1482 reset_count = fake_hangcheck(gt, BIT(id));
1483
1484 if (prev->fence.error != -EIO) {
1485 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1486 prev->fence.error);
1487 i915_request_put(rq);
1488 i915_request_put(prev);
1489 err = -EINVAL;
1490 goto fini;
1491 }
1492
1493 if (rq->fence.error) {
1494 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1495 rq->fence.error);
1496 i915_request_put(rq);
1497 i915_request_put(prev);
1498 err = -EINVAL;
1499 goto fini;
1500 }
1501
1502 if (i915_reset_count(global) == reset_count) {
1503 pr_err("No GPU reset recorded!\n");
1504 i915_request_put(rq);
1505 i915_request_put(prev);
1506 err = -EINVAL;
1507 goto fini;
1508 }
1509
1510 i915_request_put(prev);
1511 prev = rq;
1512 count++;
1513 } while (time_before(jiffies, end_time));
1514 pr_info("%s: Completed %d resets\n", engine->name, count);
1515
1516 *h.batch = MI_BATCH_BUFFER_END;
1517 intel_gt_chipset_flush(engine->gt);
1518
1519 i915_request_put(prev);
1520
1521 err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1522 if (err)
1523 break;
1524 }
1525
1526 fini:
1527 hang_fini(&h);
1528 unlock:
1529 mutex_unlock(&gt->i915->drm.struct_mutex);
1530 igt_global_reset_unlock(gt);
1531
1532 if (intel_gt_is_wedged(gt))
1533 return -EIO;
1534
1535 return err;
1536 }
1537
1538 static int igt_handle_error(void *arg)
1539 {
1540 struct intel_gt *gt = arg;
1541 struct i915_gpu_error *global = &gt->i915->gpu_error;
1542 struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1543 struct hang h;
1544 struct i915_request *rq;
1545 struct i915_gpu_state *error;
1546 int err;
1547
1548
1549
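/*
 * Check that intel_gt_handle_error() identifies the guilty request and
 * marks it with fence.error == -EIO.
 */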
1550 if (!intel_has_reset_engine(gt->i915))
1551 return 0;
1552
1553 if (!engine || !intel_engine_can_store_dword(engine))
1554 return 0;
1555
1556 mutex_lock(&gt->i915->drm.struct_mutex);
1557
1558 err = hang_init(&h, gt);
1559 if (err)
1560 goto err_unlock;
1561
1562 rq = hang_create_request(&h, engine);
1563 if (IS_ERR(rq)) {
1564 err = PTR_ERR(rq);
1565 goto err_fini;
1566 }
1567
1568 i915_request_get(rq);
1569 i915_request_add(rq);
1570
1571 if (!wait_until_running(&h, rq)) {
1572 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1573
1574 pr_err("%s: Failed to start request %llx, at %x\n",
1575 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1576 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1577
1578 intel_gt_set_wedged(gt);
1579
1580 err = -EIO;
1581 goto err_request;
1582 }
1583
1584 mutex_unlock(&gt->i915->drm.struct_mutex);
1585
1586
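/*
 * Temporarily disable error state capture while poking at
 * intel_gt_handle_error(); the original first_error is restored by the
 * second xchg() below.
 */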
1587 error = xchg(&global->first_error, (void *)-1);
1588
1589 intel_gt_handle_error(gt, engine->mask, 0, NULL);
1590
1591 xchg(&global->first_error, error);
1592
1593 mutex_lock(&gt->i915->drm.struct_mutex);
1594
1595 if (rq->fence.error != -EIO) {
1596 pr_err("Guilty request not identified!\n");
1597 err = -EINVAL;
1598 goto err_request;
1599 }
1600
1601 err_request:
1602 i915_request_put(rq);
1603 err_fini:
1604 hang_fini(&h);
1605 err_unlock:
1606 mutex_unlock(&gt->i915->drm.struct_mutex);
1607 return err;
1608 }
1609
1610 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1611 const struct igt_atomic_section *p,
1612 const char *mode)
1613 {
1614 struct tasklet_struct * const t = &engine->execlists.tasklet;
1615 int err;
1616
1617 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1618 engine->name, mode, p->name);
1619
1620 tasklet_disable_nosync(t);
1621 p->critical_section_begin();
1622
1623 err = intel_engine_reset(engine, NULL);
1624
1625 p->critical_section_end();
1626 tasklet_enable(t);
1627
1628 if (err)
1629 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1630 engine->name, mode, p->name);
1631
1632 return err;
1633 }
1634
1635 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1636 const struct igt_atomic_section *p)
1637 {
1638 struct i915_request *rq;
1639 struct hang h;
1640 int err;
1641
1642 err = __igt_atomic_reset_engine(engine, p, "idle");
1643 if (err)
1644 return err;
1645
1646 err = hang_init(&h, engine->gt);
1647 if (err)
1648 return err;
1649
1650 rq = hang_create_request(&h, engine);
1651 if (IS_ERR(rq)) {
1652 err = PTR_ERR(rq);
1653 goto out;
1654 }
1655
1656 i915_request_get(rq);
1657 i915_request_add(rq);
1658
1659 if (wait_until_running(&h, rq)) {
1660 err = __igt_atomic_reset_engine(engine, p, "active");
1661 } else {
1662 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1663 __func__, engine->name,
1664 rq->fence.seqno, hws_seqno(&h, rq));
1665 intel_gt_set_wedged(engine->gt);
1666 err = -EIO;
1667 }
1668
1669 if (err == 0) {
1670 struct intel_wedge_me w;
1671
1672 intel_wedge_on_timeout(&w, engine->gt, HZ / 20)
1673 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1674 if (intel_gt_is_wedged(engine->gt))
1675 err = -EIO;
1676 }
1677
1678 i915_request_put(rq);
1679 out:
1680 hang_fini(&h);
1681 return err;
1682 }
1683
1684 static int igt_reset_engines_atomic(void *arg)
1685 {
1686 struct intel_gt *gt = arg;
1687 const typeof(*igt_atomic_phases) *p;
1688 int err = 0;
1689
1690
1691
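/*
 * Check that engine resets work from within the atomic contexts provided
 * by igt_atomic_phases (e.g. with interrupts or preemption disabled).
 */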
1692 if (!intel_has_reset_engine(gt->i915))
1693 return 0;
1694
1695 if (USES_GUC_SUBMISSION(gt->i915))
1696 return 0;
1697
1698 igt_global_reset_lock(gt);
1699 mutex_lock(&gt->i915->drm.struct_mutex);
1700
1701
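/* Force a reset first so we start from a known-good state. */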
1702 if (!igt_force_reset(gt))
1703 goto unlock;
1704
1705 for (p = igt_atomic_phases; p->name; p++) {
1706 struct intel_engine_cs *engine;
1707 enum intel_engine_id id;
1708
1709 for_each_engine(engine, gt->i915, id) {
1710 err = igt_atomic_reset_engine(engine, p);
1711 if (err)
1712 goto out;
1713 }
1714 }
1715
1716 out:
1717
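/*
 * Having poked around inside the reset machinery, force a full reset
 * before handing the GT back.
 */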
1718 igt_force_reset(gt);
1719
1720 unlock:
1721 mutex_unlock(&gt->i915->drm.struct_mutex);
1722 igt_global_reset_unlock(gt);
1723
1724 return err;
1725 }
1726
1727 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1728 {
1729 static const struct i915_subtest tests[] = {
1730 SUBTEST(igt_hang_sanitycheck),
1731 SUBTEST(igt_reset_nop),
1732 SUBTEST(igt_reset_nop_engine),
1733 SUBTEST(igt_reset_idle_engine),
1734 SUBTEST(igt_reset_active_engine),
1735 SUBTEST(igt_reset_engines),
1736 SUBTEST(igt_reset_engines_atomic),
1737 SUBTEST(igt_reset_queue),
1738 SUBTEST(igt_reset_wait),
1739 SUBTEST(igt_reset_evict_ggtt),
1740 SUBTEST(igt_reset_evict_ppgtt),
1741 SUBTEST(igt_reset_evict_fence),
1742 SUBTEST(igt_handle_error),
1743 };
1744 struct intel_gt *gt = &i915->gt;
1745 intel_wakeref_t wakeref;
1746 bool saved_hangcheck;
1747 int err;
1748
1749 if (!intel_has_gpu_reset(gt->i915))
1750 return 0;
1751
1752 if (intel_gt_is_wedged(gt))
1753 return -EIO;
1754
1755 wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
1756 saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1757 drain_delayed_work(&gt->hangcheck.work);
1758
1759 err = intel_gt_live_subtests(tests, gt);
1760
1761 mutex_lock(&gt->i915->drm.struct_mutex);
1762 igt_flush_test(gt->i915, I915_WAIT_LOCKED);
1763 mutex_unlock(&gt->i915->drm.struct_mutex);
1764
1765 i915_modparams.enable_hangcheck = saved_hangcheck;
1766 intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
1767
1768 return err;
1769 }