This source file includes the following definitions.
- rmw_set_fw
- rmw_clear_fw
- engine_skip_context
- client_mark_guilty
- context_mark_guilty
- context_mark_innocent
- __i915_request_reset
- i915_in_reset
- i915_do_reset
- g4x_reset_complete
- g33_do_reset
- g4x_do_reset
- ironlake_do_reset
- gen6_hw_domain_reset
- gen6_reset_engines
- gen11_lock_sfc
- gen11_unlock_sfc
- gen11_reset_engines
- gen8_engine_reset_prepare
- gen8_engine_reset_cancel
- gen8_reset_engines
- intel_get_gpu_reset
- __intel_gt_reset
- intel_has_gpu_reset
- intel_has_reset_engine
- intel_reset_guc
- reset_prepare_engine
- revoke_mmaps
- reset_prepare
- gt_revoke
- gt_reset
- reset_finish_engine
- reset_finish
- nop_submit_request
- __intel_gt_set_wedged
- intel_gt_set_wedged
- __intel_gt_unset_wedged
- intel_gt_unset_wedged
- do_reset
- resume
- intel_gt_reset
- intel_gt_reset_engine
- intel_engine_reset
- intel_gt_reset_global
- intel_gt_handle_error
- intel_gt_reset_trylock
- intel_gt_reset_unlock
- intel_gt_terminally_wedged
- intel_gt_init_reset
- intel_gt_fini_reset
- intel_wedge_me
- __intel_init_wedge
- __intel_fini_wedge
1
2
3
4
5
6
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
9
10 #include "display/intel_display_types.h"
11 #include "display/intel_overlay.h"
12
13 #include "gem/i915_gem_context.h"
14
15 #include "i915_drv.h"
16 #include "i915_gpu_error.h"
17 #include "i915_irq.h"
18 #include "intel_engine_pm.h"
19 #include "intel_gt.h"
20 #include "intel_gt_pm.h"
21 #include "intel_reset.h"
22
23 #include "uc/intel_guc.h"
24
25 #define RESET_MAX_RETRIES 3
26
27
28 #define RESET_UNDER_STOP_MACHINE 0
29
30 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
31 {
32 intel_uncore_rmw_fw(uncore, reg, 0, set);
33 }
34
35 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
36 {
37 intel_uncore_rmw_fw(uncore, reg, clr, 0);
38 }
39
40 static void engine_skip_context(struct i915_request *rq)
41 {
42 struct intel_engine_cs *engine = rq->engine;
43 struct i915_gem_context *hung_ctx = rq->gem_context;
44
45 if (!i915_request_is_active(rq))
46 return;
47
48 lockdep_assert_held(&engine->active.lock);
49 list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
50 if (rq->gem_context == hung_ctx)
51 i915_request_skip(rq, -EIO);
52 }
53
54 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
55 const struct i915_gem_context *ctx)
56 {
57 unsigned int score;
58 unsigned long prev_hang;
59
60 if (i915_gem_context_is_banned(ctx))
61 score = I915_CLIENT_SCORE_CONTEXT_BAN;
62 else
63 score = 0;
64
65 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
66 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
67 score += I915_CLIENT_SCORE_HANG_FAST;
68
69 if (score) {
70 atomic_add(score, &file_priv->ban_score);
71
72 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
73 ctx->name, score,
74 atomic_read(&file_priv->ban_score));
75 }
76 }
77
78 static bool context_mark_guilty(struct i915_gem_context *ctx)
79 {
80 unsigned long prev_hang;
81 bool banned;
82 int i;
83
84 atomic_inc(&ctx->guilty_count);
85
86
87 if (!i915_gem_context_is_bannable(ctx))
88 return false;
89
90
91 prev_hang = ctx->hang_timestamp[0];
92 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
93 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
94 ctx->hang_timestamp[i] = jiffies;
95
96
97 banned = !i915_gem_context_is_recoverable(ctx);
98 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
99 banned = true;
100 if (banned) {
101 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
102 ctx->name, atomic_read(&ctx->guilty_count));
103 i915_gem_context_set_banned(ctx);
104 }
105
106 if (!IS_ERR_OR_NULL(ctx->file_priv))
107 client_mark_guilty(ctx->file_priv, ctx);
108
109 return banned;
110 }
111
112 static void context_mark_innocent(struct i915_gem_context *ctx)
113 {
114 atomic_inc(&ctx->active_count);
115 }
116
117 void __i915_request_reset(struct i915_request *rq, bool guilty)
118 {
119 GEM_TRACE("%s rq=%llx:%lld, guilty? %s\n",
120 rq->engine->name,
121 rq->fence.context,
122 rq->fence.seqno,
123 yesno(guilty));
124
125 GEM_BUG_ON(i915_request_completed(rq));
126
127 if (guilty) {
128 i915_request_skip(rq, -EIO);
129 if (context_mark_guilty(rq->gem_context))
130 engine_skip_context(rq);
131 } else {
132 dma_fence_set_error(&rq->fence, -EAGAIN);
133 context_mark_innocent(rq->gem_context);
134 }
135 }
136
137 static bool i915_in_reset(struct pci_dev *pdev)
138 {
139 u8 gdrst;
140
141 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
142 return gdrst & GRDOM_RESET_STATUS;
143 }
144
145 static int i915_do_reset(struct intel_gt *gt,
146 intel_engine_mask_t engine_mask,
147 unsigned int retry)
148 {
149 struct pci_dev *pdev = gt->i915->drm.pdev;
150 int err;
151
152
153 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
154 udelay(50);
155 err = wait_for_atomic(i915_in_reset(pdev), 50);
156
157
158 pci_write_config_byte(pdev, I915_GDRST, 0);
159 udelay(50);
160 if (!err)
161 err = wait_for_atomic(!i915_in_reset(pdev), 50);
162
163 return err;
164 }
165
166 static bool g4x_reset_complete(struct pci_dev *pdev)
167 {
168 u8 gdrst;
169
170 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
171 return (gdrst & GRDOM_RESET_ENABLE) == 0;
172 }
173
174 static int g33_do_reset(struct intel_gt *gt,
175 intel_engine_mask_t engine_mask,
176 unsigned int retry)
177 {
178 struct pci_dev *pdev = gt->i915->drm.pdev;
179
180 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
181 return wait_for_atomic(g4x_reset_complete(pdev), 50);
182 }
183
184 static int g4x_do_reset(struct intel_gt *gt,
185 intel_engine_mask_t engine_mask,
186 unsigned int retry)
187 {
188 struct pci_dev *pdev = gt->i915->drm.pdev;
189 struct intel_uncore *uncore = gt->uncore;
190 int ret;
191
192
193 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
194 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
195
196 pci_write_config_byte(pdev, I915_GDRST,
197 GRDOM_MEDIA | GRDOM_RESET_ENABLE);
198 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
199 if (ret) {
200 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
201 goto out;
202 }
203
204 pci_write_config_byte(pdev, I915_GDRST,
205 GRDOM_RENDER | GRDOM_RESET_ENABLE);
206 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
207 if (ret) {
208 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
209 goto out;
210 }
211
212 out:
213 pci_write_config_byte(pdev, I915_GDRST, 0);
214
215 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
216 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
217
218 return ret;
219 }
220
221 static int ironlake_do_reset(struct intel_gt *gt,
222 intel_engine_mask_t engine_mask,
223 unsigned int retry)
224 {
225 struct intel_uncore *uncore = gt->uncore;
226 int ret;
227
228 intel_uncore_write_fw(uncore, ILK_GDSR,
229 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
230 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
231 ILK_GRDOM_RESET_ENABLE, 0,
232 5000, 0,
233 NULL);
234 if (ret) {
235 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
236 goto out;
237 }
238
239 intel_uncore_write_fw(uncore, ILK_GDSR,
240 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
241 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
242 ILK_GRDOM_RESET_ENABLE, 0,
243 5000, 0,
244 NULL);
245 if (ret) {
246 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
247 goto out;
248 }
249
250 out:
251 intel_uncore_write_fw(uncore, ILK_GDSR, 0);
252 intel_uncore_posting_read_fw(uncore, ILK_GDSR);
253 return ret;
254 }
255
256
257 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
258 {
259 struct intel_uncore *uncore = gt->uncore;
260 int err;
261
262
263
264
265
266
267 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
268
269
270 err = __intel_wait_for_register_fw(uncore,
271 GEN6_GDRST, hw_domain_mask, 0,
272 500, 0,
273 NULL);
274 if (err)
275 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
276 hw_domain_mask);
277
278 return err;
279 }
280
281 static int gen6_reset_engines(struct intel_gt *gt,
282 intel_engine_mask_t engine_mask,
283 unsigned int retry)
284 {
285 struct intel_engine_cs *engine;
286 const u32 hw_engine_mask[] = {
287 [RCS0] = GEN6_GRDOM_RENDER,
288 [BCS0] = GEN6_GRDOM_BLT,
289 [VCS0] = GEN6_GRDOM_MEDIA,
290 [VCS1] = GEN8_GRDOM_MEDIA2,
291 [VECS0] = GEN6_GRDOM_VECS,
292 };
293 u32 hw_mask;
294
295 if (engine_mask == ALL_ENGINES) {
296 hw_mask = GEN6_GRDOM_FULL;
297 } else {
298 intel_engine_mask_t tmp;
299
300 hw_mask = 0;
301 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
302 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
303 hw_mask |= hw_engine_mask[engine->id];
304 }
305 }
306
307 return gen6_hw_domain_reset(gt, hw_mask);
308 }
309
310 static u32 gen11_lock_sfc(struct intel_engine_cs *engine)
311 {
312 struct intel_uncore *uncore = engine->uncore;
313 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
314 i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
315 u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
316 i915_reg_t sfc_usage;
317 u32 sfc_usage_bit;
318 u32 sfc_reset_bit;
319
320 switch (engine->class) {
321 case VIDEO_DECODE_CLASS:
322 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
323 return 0;
324
325 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
326 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
327
328 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
329 sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
330
331 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
332 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
333 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
334 break;
335
336 case VIDEO_ENHANCEMENT_CLASS:
337 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
338 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
339
340 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
341 sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
342
343 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
344 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
345 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
346 break;
347
348 default:
349 return 0;
350 }
351
352
353
354
355
356
357
358
359
360 rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
361
362 if (__intel_wait_for_register_fw(uncore,
363 sfc_forced_lock_ack,
364 sfc_forced_lock_ack_bit,
365 sfc_forced_lock_ack_bit,
366 1000, 0, NULL)) {
367 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
368 return 0;
369 }
370
371 if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit)
372 return sfc_reset_bit;
373
374 return 0;
375 }
376
377 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
378 {
379 struct intel_uncore *uncore = engine->uncore;
380 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
381 i915_reg_t sfc_forced_lock;
382 u32 sfc_forced_lock_bit;
383
384 switch (engine->class) {
385 case VIDEO_DECODE_CLASS:
386 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
387 return;
388
389 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
390 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
391 break;
392
393 case VIDEO_ENHANCEMENT_CLASS:
394 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
395 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
396 break;
397
398 default:
399 return;
400 }
401
402 rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
403 }
404
405 static int gen11_reset_engines(struct intel_gt *gt,
406 intel_engine_mask_t engine_mask,
407 unsigned int retry)
408 {
409 const u32 hw_engine_mask[] = {
410 [RCS0] = GEN11_GRDOM_RENDER,
411 [BCS0] = GEN11_GRDOM_BLT,
412 [VCS0] = GEN11_GRDOM_MEDIA,
413 [VCS1] = GEN11_GRDOM_MEDIA2,
414 [VCS2] = GEN11_GRDOM_MEDIA3,
415 [VCS3] = GEN11_GRDOM_MEDIA4,
416 [VECS0] = GEN11_GRDOM_VECS,
417 [VECS1] = GEN11_GRDOM_VECS2,
418 };
419 struct intel_engine_cs *engine;
420 intel_engine_mask_t tmp;
421 u32 hw_mask;
422 int ret;
423
424 if (engine_mask == ALL_ENGINES) {
425 hw_mask = GEN11_GRDOM_FULL;
426 } else {
427 hw_mask = 0;
428 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
429 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
430 hw_mask |= hw_engine_mask[engine->id];
431 hw_mask |= gen11_lock_sfc(engine);
432 }
433 }
434
435 ret = gen6_hw_domain_reset(gt, hw_mask);
436
437 if (engine_mask != ALL_ENGINES)
438 for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
439 gen11_unlock_sfc(engine);
440
441 return ret;
442 }
443
444 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
445 {
446 struct intel_uncore *uncore = engine->uncore;
447 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
448 u32 request, mask, ack;
449 int ret;
450
451 ack = intel_uncore_read_fw(uncore, reg);
452 if (ack & RESET_CTL_CAT_ERROR) {
453
454
455
456
457 request = RESET_CTL_CAT_ERROR;
458 mask = RESET_CTL_CAT_ERROR;
459
460
461 ack = 0;
462 } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
463 request = RESET_CTL_REQUEST_RESET;
464 mask = RESET_CTL_READY_TO_RESET;
465 ack = RESET_CTL_READY_TO_RESET;
466 } else {
467 return 0;
468 }
469
470 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
471 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
472 700, 0, NULL);
473 if (ret)
474 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
475 engine->name, request,
476 intel_uncore_read_fw(uncore, reg));
477
478 return ret;
479 }
480
481 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
482 {
483 intel_uncore_write_fw(engine->uncore,
484 RING_RESET_CTL(engine->mmio_base),
485 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
486 }
487
488 static int gen8_reset_engines(struct intel_gt *gt,
489 intel_engine_mask_t engine_mask,
490 unsigned int retry)
491 {
492 struct intel_engine_cs *engine;
493 const bool reset_non_ready = retry >= 1;
494 intel_engine_mask_t tmp;
495 int ret;
496
497 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
498 ret = gen8_engine_reset_prepare(engine);
499 if (ret && !reset_non_ready)
500 goto skip_reset;
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515 }
516
517 if (INTEL_GEN(gt->i915) >= 11)
518 ret = gen11_reset_engines(gt, engine_mask, retry);
519 else
520 ret = gen6_reset_engines(gt, engine_mask, retry);
521
522 skip_reset:
523 for_each_engine_masked(engine, gt->i915, engine_mask, tmp)
524 gen8_engine_reset_cancel(engine);
525
526 return ret;
527 }
528
529 typedef int (*reset_func)(struct intel_gt *,
530 intel_engine_mask_t engine_mask,
531 unsigned int retry);
532
533 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
534 {
535 if (INTEL_GEN(i915) >= 8)
536 return gen8_reset_engines;
537 else if (INTEL_GEN(i915) >= 6)
538 return gen6_reset_engines;
539 else if (INTEL_GEN(i915) >= 5)
540 return ironlake_do_reset;
541 else if (IS_G4X(i915))
542 return g4x_do_reset;
543 else if (IS_G33(i915) || IS_PINEVIEW(i915))
544 return g33_do_reset;
545 else if (INTEL_GEN(i915) >= 3)
546 return i915_do_reset;
547 else
548 return NULL;
549 }
550
551 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
552 {
553 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
554 reset_func reset;
555 int ret = -ETIMEDOUT;
556 int retry;
557
558 reset = intel_get_gpu_reset(gt->i915);
559 if (!reset)
560 return -ENODEV;
561
562
563
564
565
566 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
567 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
568 GEM_TRACE("engine_mask=%x\n", engine_mask);
569 preempt_disable();
570 ret = reset(gt, engine_mask, retry);
571 preempt_enable();
572 }
573 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
574
575 return ret;
576 }
577
578 bool intel_has_gpu_reset(struct drm_i915_private *i915)
579 {
580 if (!i915_modparams.reset)
581 return NULL;
582
583 return intel_get_gpu_reset(i915);
584 }
585
586 bool intel_has_reset_engine(struct drm_i915_private *i915)
587 {
588 return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
589 }
590
591 int intel_reset_guc(struct intel_gt *gt)
592 {
593 u32 guc_domain =
594 INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
595 int ret;
596
597 GEM_BUG_ON(!HAS_GT_UC(gt->i915));
598
599 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
600 ret = gen6_hw_domain_reset(gt, guc_domain);
601 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
602
603 return ret;
604 }
605
606
607
608
609
610 static void reset_prepare_engine(struct intel_engine_cs *engine)
611 {
612
613
614
615
616
617
618
619 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
620 engine->reset.prepare(engine);
621 }
622
623 static void revoke_mmaps(struct intel_gt *gt)
624 {
625 int i;
626
627 for (i = 0; i < gt->ggtt->num_fences; i++) {
628 struct drm_vma_offset_node *node;
629 struct i915_vma *vma;
630 u64 vma_offset;
631
632 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
633 if (!vma)
634 continue;
635
636 if (!i915_vma_has_userfault(vma))
637 continue;
638
639 GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]);
640 node = &vma->obj->base.vma_node;
641 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
642 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
643 drm_vma_node_offset_addr(node) + vma_offset,
644 vma->size,
645 1);
646 }
647 }
648
649 static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
650 {
651 struct intel_engine_cs *engine;
652 intel_engine_mask_t awake = 0;
653 enum intel_engine_id id;
654
655 for_each_engine(engine, gt->i915, id) {
656 if (intel_engine_pm_get_if_awake(engine))
657 awake |= engine->mask;
658 reset_prepare_engine(engine);
659 }
660
661 intel_uc_reset_prepare(>->uc);
662
663 return awake;
664 }
665
/* Revoke userspace access ahead of a reset; currently only the fenced mmaps. */
static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}
670
671 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
672 {
673 struct intel_engine_cs *engine;
674 enum intel_engine_id id;
675 int err;
676
677
678
679
680
681 err = i915_ggtt_enable_hw(gt->i915);
682 if (err)
683 return err;
684
685 for_each_engine(engine, gt->i915, id)
686 __intel_engine_reset(engine, stalled_mask & engine->mask);
687
688 i915_gem_restore_fences(gt->i915);
689
690 return err;
691 }
692
693 static void reset_finish_engine(struct intel_engine_cs *engine)
694 {
695 engine->reset.finish(engine);
696 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
697
698 intel_engine_signal_breadcrumbs(engine);
699 }
700
701 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
702 {
703 struct intel_engine_cs *engine;
704 enum intel_engine_id id;
705
706 for_each_engine(engine, gt->i915, id) {
707 reset_finish_engine(engine);
708 if (awake & engine->mask)
709 intel_engine_pm_put(engine);
710 }
711 }
712
713 static void nop_submit_request(struct i915_request *request)
714 {
715 struct intel_engine_cs *engine = request->engine;
716 unsigned long flags;
717
718 GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
719 engine->name, request->fence.context, request->fence.seqno);
720 dma_fence_set_error(&request->fence, -EIO);
721
722 spin_lock_irqsave(&engine->active.lock, flags);
723 __i915_request_submit(request);
724 i915_request_mark_complete(request);
725 spin_unlock_irqrestore(&engine->active.lock, flags);
726
727 intel_engine_queue_breadcrumbs(engine);
728 }
729
730 static void __intel_gt_set_wedged(struct intel_gt *gt)
731 {
732 struct intel_engine_cs *engine;
733 intel_engine_mask_t awake;
734 enum intel_engine_id id;
735
736 if (test_bit(I915_WEDGED, >->reset.flags))
737 return;
738
739 if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
740 struct drm_printer p = drm_debug_printer(__func__);
741
742 for_each_engine(engine, gt->i915, id)
743 intel_engine_dump(engine, &p, "%s\n", engine->name);
744 }
745
746 GEM_TRACE("start\n");
747
748
749
750
751
752
753 awake = reset_prepare(gt);
754
755
756 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
757 __intel_gt_reset(gt, ALL_ENGINES);
758
759 for_each_engine(engine, gt->i915, id)
760 engine->submit_request = nop_submit_request;
761
762
763
764
765
766
767 synchronize_rcu_expedited();
768 set_bit(I915_WEDGED, >->reset.flags);
769
770
771 for_each_engine(engine, gt->i915, id)
772 engine->cancel_requests(engine);
773
774 reset_finish(gt, awake);
775
776 GEM_TRACE("end\n");
777 }
778
779 void intel_gt_set_wedged(struct intel_gt *gt)
780 {
781 intel_wakeref_t wakeref;
782
783 mutex_lock(>->reset.mutex);
784 with_intel_runtime_pm(>->i915->runtime_pm, wakeref)
785 __intel_gt_set_wedged(gt);
786 mutex_unlock(>->reset.mutex);
787 }
788
789 static bool __intel_gt_unset_wedged(struct intel_gt *gt)
790 {
791 struct intel_gt_timelines *timelines = >->timelines;
792 struct intel_timeline *tl;
793 unsigned long flags;
794
795 if (!test_bit(I915_WEDGED, >->reset.flags))
796 return true;
797
798 if (!gt->scratch)
799 return false;
800
801 GEM_TRACE("start\n");
802
803
804
805
806
807
808
809
810
811
812
813 spin_lock_irqsave(&timelines->lock, flags);
814 list_for_each_entry(tl, &timelines->active_list, link) {
815 struct i915_request *rq;
816
817 rq = i915_active_request_get_unlocked(&tl->last_request);
818 if (!rq)
819 continue;
820
821 spin_unlock_irqrestore(&timelines->lock, flags);
822
823
824
825
826
827
828
829
830 dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
831 i915_request_put(rq);
832
833
834 spin_lock_irqsave(&timelines->lock, flags);
835 tl = list_entry(&timelines->active_list, typeof(*tl), link);
836 }
837 spin_unlock_irqrestore(&timelines->lock, flags);
838
839 intel_gt_sanitize(gt, false);
840
841
842
843
844
845
846
847
848
849
850 intel_engines_reset_default_submission(gt);
851
852 GEM_TRACE("end\n");
853
854 smp_mb__before_atomic();
855 clear_bit(I915_WEDGED, >->reset.flags);
856
857 return true;
858 }
859
860 bool intel_gt_unset_wedged(struct intel_gt *gt)
861 {
862 bool result;
863
864 mutex_lock(>->reset.mutex);
865 result = __intel_gt_unset_wedged(gt);
866 mutex_unlock(>->reset.mutex);
867
868 return result;
869 }
870
871 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
872 {
873 int err, i;
874
875 gt_revoke(gt);
876
877 err = __intel_gt_reset(gt, ALL_ENGINES);
878 for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
879 msleep(10 * (i + 1));
880 err = __intel_gt_reset(gt, ALL_ENGINES);
881 }
882 if (err)
883 return err;
884
885 return gt_reset(gt, stalled_mask);
886 }
887
888 static int resume(struct intel_gt *gt)
889 {
890 struct intel_engine_cs *engine;
891 enum intel_engine_id id;
892 int ret;
893
894 for_each_engine(engine, gt->i915, id) {
895 ret = engine->resume(engine);
896 if (ret)
897 return ret;
898 }
899
900 return 0;
901 }
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920 void intel_gt_reset(struct intel_gt *gt,
921 intel_engine_mask_t stalled_mask,
922 const char *reason)
923 {
924 intel_engine_mask_t awake;
925 int ret;
926
927 GEM_TRACE("flags=%lx\n", gt->reset.flags);
928
929 might_sleep();
930 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags));
931 mutex_lock(>->reset.mutex);
932
933
934 if (!__intel_gt_unset_wedged(gt))
935 goto unlock;
936
937 if (reason)
938 dev_notice(gt->i915->drm.dev,
939 "Resetting chip for %s\n", reason);
940 atomic_inc(>->i915->gpu_error.reset_count);
941
942 awake = reset_prepare(gt);
943
944 if (!intel_has_gpu_reset(gt->i915)) {
945 if (i915_modparams.reset)
946 dev_err(gt->i915->drm.dev, "GPU reset not supported\n");
947 else
948 DRM_DEBUG_DRIVER("GPU reset disabled\n");
949 goto error;
950 }
951
952 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
953 intel_runtime_pm_disable_interrupts(gt->i915);
954
955 if (do_reset(gt, stalled_mask)) {
956 dev_err(gt->i915->drm.dev, "Failed to reset chip\n");
957 goto taint;
958 }
959
960 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
961 intel_runtime_pm_enable_interrupts(gt->i915);
962
963 intel_overlay_reset(gt->i915);
964
965
966
967
968
969
970
971
972
973 ret = i915_gem_init_hw(gt->i915);
974 if (ret) {
975 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
976 ret);
977 goto taint;
978 }
979
980 ret = resume(gt);
981 if (ret)
982 goto taint;
983
984 intel_gt_queue_hangcheck(gt);
985
986 finish:
987 reset_finish(gt, awake);
988 unlock:
989 mutex_unlock(>->reset.mutex);
990 return;
991
992 taint:
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005 add_taint_for_CI(TAINT_WARN);
1006 error:
1007 __intel_gt_set_wedged(gt);
1008 goto finish;
1009 }
1010
1011 static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
1012 {
1013 return __intel_gt_reset(engine->gt, engine->mask);
1014 }
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1030 {
1031 struct intel_gt *gt = engine->gt;
1032 int ret;
1033
1034 GEM_TRACE("%s flags=%lx\n", engine->name, gt->reset.flags);
1035 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags));
1036
1037 if (!intel_engine_pm_get_if_awake(engine))
1038 return 0;
1039
1040 reset_prepare_engine(engine);
1041
1042 if (msg)
1043 dev_notice(engine->i915->drm.dev,
1044 "Resetting %s for %s\n", engine->name, msg);
1045 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1046
1047 if (!engine->gt->uc.guc.execbuf_client)
1048 ret = intel_gt_reset_engine(engine);
1049 else
1050 ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
1051 if (ret) {
1052
1053 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1054 engine->gt->uc.guc.execbuf_client ? "GuC " : "",
1055 engine->name, ret);
1056 goto out;
1057 }
1058
1059
1060
1061
1062
1063
1064 __intel_engine_reset(engine, true);
1065
1066
1067
1068
1069
1070
1071 ret = engine->resume(engine);
1072
1073 out:
1074 intel_engine_cancel_stop_cs(engine);
1075 reset_finish_engine(engine);
1076 intel_engine_pm_put(engine);
1077 return ret;
1078 }
1079
1080 static void intel_gt_reset_global(struct intel_gt *gt,
1081 u32 engine_mask,
1082 const char *reason)
1083 {
1084 struct kobject *kobj = >->i915->drm.primary->kdev->kobj;
1085 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1086 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1087 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1088 struct intel_wedge_me w;
1089
1090 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1091
1092 DRM_DEBUG_DRIVER("resetting chip\n");
1093 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1094
1095
1096 intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1097 intel_prepare_reset(gt->i915);
1098
1099
1100 synchronize_srcu_expedited(>->reset.backoff_srcu);
1101
1102 intel_gt_reset(gt, engine_mask, reason);
1103
1104 intel_finish_reset(gt->i915);
1105 }
1106
1107 if (!test_bit(I915_WEDGED, >->reset.flags))
1108 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1109 }
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124 void intel_gt_handle_error(struct intel_gt *gt,
1125 intel_engine_mask_t engine_mask,
1126 unsigned long flags,
1127 const char *fmt, ...)
1128 {
1129 struct intel_engine_cs *engine;
1130 intel_wakeref_t wakeref;
1131 intel_engine_mask_t tmp;
1132 char error_msg[80];
1133 char *msg = NULL;
1134
1135 if (fmt) {
1136 va_list args;
1137
1138 va_start(args, fmt);
1139 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1140 va_end(args);
1141
1142 msg = error_msg;
1143 }
1144
1145
1146
1147
1148
1149
1150
1151
1152 wakeref = intel_runtime_pm_get(>->i915->runtime_pm);
1153
1154 engine_mask &= INTEL_INFO(gt->i915)->engine_mask;
1155
1156 if (flags & I915_ERROR_CAPTURE) {
1157 i915_capture_error_state(gt->i915, engine_mask, msg);
1158 intel_gt_clear_error_registers(gt, engine_mask);
1159 }
1160
1161
1162
1163
1164
1165 if (intel_has_reset_engine(gt->i915) && !intel_gt_is_wedged(gt)) {
1166 for_each_engine_masked(engine, gt->i915, engine_mask, tmp) {
1167 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1168 if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1169 >->reset.flags))
1170 continue;
1171
1172 if (intel_engine_reset(engine, msg) == 0)
1173 engine_mask &= ~engine->mask;
1174
1175 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1176 >->reset.flags);
1177 }
1178 }
1179
1180 if (!engine_mask)
1181 goto out;
1182
1183
1184 if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) {
1185 wait_event(gt->reset.queue,
1186 !test_bit(I915_RESET_BACKOFF, >->reset.flags));
1187 goto out;
1188 }
1189
1190
1191 synchronize_rcu_expedited();
1192
1193
1194 for_each_engine(engine, gt->i915, tmp) {
1195 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1196 >->reset.flags))
1197 wait_on_bit(>->reset.flags,
1198 I915_RESET_ENGINE + engine->id,
1199 TASK_UNINTERRUPTIBLE);
1200 }
1201
1202 intel_gt_reset_global(gt, engine_mask, msg);
1203
1204 for_each_engine(engine, gt->i915, tmp)
1205 clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1206 >->reset.flags);
1207 clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags);
1208 smp_mb__after_atomic();
1209 wake_up_all(>->reset.queue);
1210
1211 out:
1212 intel_runtime_pm_put(>->i915->runtime_pm, wakeref);
1213 }
1214
1215 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1216 {
1217 might_lock(>->reset.backoff_srcu);
1218 might_sleep();
1219
1220 rcu_read_lock();
1221 while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) {
1222 rcu_read_unlock();
1223
1224 if (wait_event_interruptible(gt->reset.queue,
1225 !test_bit(I915_RESET_BACKOFF,
1226 >->reset.flags)))
1227 return -EINTR;
1228
1229 rcu_read_lock();
1230 }
1231 *srcu = srcu_read_lock(>->reset.backoff_srcu);
1232 rcu_read_unlock();
1233
1234 return 0;
1235 }
1236
1237 void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1238 __releases(>->reset.backoff_srcu)
1239 {
1240 srcu_read_unlock(>->reset.backoff_srcu, tag);
1241 }
1242
1243 int intel_gt_terminally_wedged(struct intel_gt *gt)
1244 {
1245 might_sleep();
1246
1247 if (!intel_gt_is_wedged(gt))
1248 return 0;
1249
1250
1251 if (!test_bit(I915_RESET_BACKOFF, >->reset.flags))
1252 return -EIO;
1253
1254
1255 if (mutex_is_locked(>->i915->drm.struct_mutex))
1256 return -EAGAIN;
1257
1258 if (wait_event_interruptible(gt->reset.queue,
1259 !test_bit(I915_RESET_BACKOFF,
1260 >->reset.flags)))
1261 return -EINTR;
1262
1263 return intel_gt_is_wedged(gt) ? -EIO : 0;
1264 }
1265
1266 void intel_gt_init_reset(struct intel_gt *gt)
1267 {
1268 init_waitqueue_head(>->reset.queue);
1269 mutex_init(>->reset.mutex);
1270 init_srcu_struct(>->reset.backoff_srcu);
1271 }
1272
1273 void intel_gt_fini_reset(struct intel_gt *gt)
1274 {
1275 cleanup_srcu_struct(>->reset.backoff_srcu);
1276 }
1277
1278 static void intel_wedge_me(struct work_struct *work)
1279 {
1280 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1281
1282 dev_err(w->gt->i915->drm.dev,
1283 "%s timed out, cancelling all in-flight rendering.\n",
1284 w->name);
1285 intel_gt_set_wedged(w->gt);
1286 }
1287
1288 void __intel_init_wedge(struct intel_wedge_me *w,
1289 struct intel_gt *gt,
1290 long timeout,
1291 const char *name)
1292 {
1293 w->gt = gt;
1294 w->name = name;
1295
1296 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1297 schedule_delayed_work(&w->work, timeout);
1298 }
1299
1300 void __intel_fini_wedge(struct intel_wedge_me *w)
1301 {
1302 cancel_delayed_work_sync(&w->work);
1303 destroy_delayed_work_on_stack(&w->work);
1304 w->gt = NULL;
1305 }
1306
1307 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1308 #include "selftest_reset.c"
1309 #endif