/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#include <linux/sched/mm.h>
#include <linux/stop_machine.h>

#include "display/intel_display_types.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_irq.h"
#include "intel_breadcrumbs.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_reset.h"

#include "uc/intel_guc.h"
#include "uc/intel_guc_submission.h"

#define RESET_MAX_RETRIES 3

/* XXX How to handle concurrent GGTT updates using tiling registers? */
#define RESET_UNDER_STOP_MACHINE 0

static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
	intel_uncore_rmw_fw(uncore, reg, 0, set);
}

static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
	intel_uncore_rmw_fw(uncore, reg, clr, 0);
}

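/*
 * Cancel the remaining submitted requests that belong to the hung context,
 * so that the engine does not replay work from the guilty context once it
 * is restarted.
 */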
static void engine_skip_context(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_context *hung_ctx = rq->context;

	if (!i915_request_is_active(rq))
		return;

	lockdep_assert_held(&engine->active.lock);
	list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
		if (rq->context == hung_ctx) {
			i915_request_set_error_once(rq, -EIO);
			__i915_request_skip(rq);
		}
}

static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
{
	struct drm_i915_file_private *file_priv = ctx->file_priv;
	unsigned long prev_hang;
	unsigned int score;

	if (IS_ERR_OR_NULL(file_priv))
		return;

	score = 0;
	if (banned)
		score = I915_CLIENT_SCORE_CONTEXT_BAN;

	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
		score += I915_CLIENT_SCORE_HANG_FAST;

	if (score) {
		atomic_add(score, &file_priv->ban_score);

		drm_dbg(&ctx->i915->drm,
			"client %s: gained %u ban score, now %u\n",
			ctx->name, score,
			atomic_read(&file_priv->ban_score));
	}
}

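/*
 * Mark the context that owned the hung request as guilty: bump its guilty
 * count, ban it if it is unrecoverable or hangs again too quickly, and
 * propagate a ban score to the owning client.
 */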
static bool mark_guilty(struct i915_request *rq)
{
	struct i915_gem_context *ctx;
	unsigned long prev_hang;
	bool banned;
	int i;

	if (intel_context_is_closed(rq->context)) {
		intel_context_set_banned(rq->context);
		return true;
	}

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx && !kref_get_unless_zero(&ctx->ref))
		ctx = NULL;
	rcu_read_unlock();
	if (!ctx)
		return intel_context_is_banned(rq->context);

	atomic_inc(&ctx->guilty_count);

	/* Cool contexts are too cool to be banned! (Used for reset testing.) */
	if (!i915_gem_context_is_bannable(ctx)) {
		banned = false;
		goto out;
	}

	drm_notice(&ctx->i915->drm,
		   "%s context reset due to GPU hang\n",
		   ctx->name);

	/* Record the timestamp for the last N hangs */
	prev_hang = ctx->hang_timestamp[0];
	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
		ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
	ctx->hang_timestamp[i] = jiffies;

	/* If we have hung N+1 times in rapid succession, we ban the context! */
	banned = !i915_gem_context_is_recoverable(ctx);
	if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
		banned = true;
	if (banned) {
		drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
			ctx->name, atomic_read(&ctx->guilty_count));
		intel_context_set_banned(rq->context);
	}

	client_mark_guilty(ctx, banned);

out:
	i915_gem_context_put(ctx);
	return banned;
}

static void mark_innocent(struct i915_request *rq)
{
	struct i915_gem_context *ctx;

	rcu_read_lock();
	ctx = rcu_dereference(rq->context->gem_context);
	if (ctx)
		atomic_inc(&ctx->active_count);
	rcu_read_unlock();
}

void __i915_request_reset(struct i915_request *rq, bool guilty)
{
	RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));

	GEM_BUG_ON(i915_request_completed(rq));

	rcu_read_lock(); /* protect the GEM context */
	if (guilty) {
		i915_request_set_error_once(rq, -EIO);
		__i915_request_skip(rq);
		if (mark_guilty(rq))
			engine_skip_context(rq);
	} else {
		i915_request_set_error_once(rq, -EAGAIN);
		mark_innocent(rq);
	}
	rcu_read_unlock();
}

static bool i915_in_reset(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return gdrst & GRDOM_RESET_STATUS;
}

static int i915_do_reset(struct intel_gt *gt,
			 intel_engine_mask_t engine_mask,
			 unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	int err;

	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	udelay(50);
	err = wait_for_atomic(i915_in_reset(pdev), 50);

	/* Clear the reset request. */
	pci_write_config_byte(pdev, I915_GDRST, 0);
	udelay(50);
	if (!err)
		err = wait_for_atomic(!i915_in_reset(pdev), 50);

	return err;
}

static bool g4x_reset_complete(struct pci_dev *pdev)
{
	u8 gdrst;

	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
	return (gdrst & GRDOM_RESET_ENABLE) == 0;
}

static int g33_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;

	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
	return wait_for_atomic(g4x_reset_complete(pdev), 50);
}

static int g4x_do_reset(struct intel_gt *gt,
			intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct pci_dev *pdev = gt->i915->drm.pdev;
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
	rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

	pci_write_config_byte(pdev, I915_GDRST,
			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
	ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

out:
	pci_write_config_byte(pdev, I915_GDRST, 0);

	rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
	intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);

	return ret;
}

static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
			unsigned int retry)
{
	struct intel_uncore *uncore = gt->uncore;
	int ret;

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for render reset failed\n");
		goto out;
	}

	intel_uncore_write_fw(uncore, ILK_GDSR,
			      ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
	ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
					   ILK_GRDOM_RESET_ENABLE, 0,
					   5000, 0,
					   NULL);
	if (ret) {
		drm_dbg(&gt->i915->drm, "Wait for media reset failed\n");
		goto out;
	}

out:
	intel_uncore_write_fw(uncore, ILK_GDSR, 0);
	intel_uncore_posting_read_fw(uncore, ILK_GDSR);
	return ret;
}

/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
	struct intel_uncore *uncore = gt->uncore;
	int err;

	/*
	 * GEN6_GDRST is not in the gt power well, no need to check
	 * for fifo space for the write or forcewake the chip for
	 * the read
	 */
	intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);

	/* Wait for the device to ack the reset requests */
	err = __intel_wait_for_register_fw(uncore,
					   GEN6_GDRST, hw_domain_mask, 0,
					   500, 0,
					   NULL);
	if (err)
		drm_dbg(&gt->i915->drm,
			"Wait for 0x%08x engines reset failed\n",
			hw_domain_mask);

	return err;
}

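/*
 * Translate the engine mask into the per-engine GEN6_GRDOM_* reset domains
 * (or the full-chip domain for ALL_ENGINES) and request the reset through
 * GEN6_GDRST.
 */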
static int gen6_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN6_GRDOM_RENDER,
		[BCS0]  = GEN6_GRDOM_BLT,
		[VCS0]  = GEN6_GRDOM_MEDIA,
		[VCS1]  = GEN8_GRDOM_MEDIA2,
		[VECS0] = GEN6_GRDOM_VECS,
	};
	struct intel_engine_cs *engine;
	u32 hw_mask;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN6_GRDOM_FULL;
	} else {
		intel_engine_mask_t tmp;

		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
		}
	}

	return gen6_hw_domain_reset(gt, hw_mask);
}

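/*
 * The SFC units are shared between the video decode and video enhancement
 * engines. Before resetting an engine that may be using an SFC, force-lock
 * the SFC to that engine so it is reset (and later released) together with
 * the engine.
 */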
static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
	i915_reg_t sfc_usage;
	u32 sfc_usage_bit;
	u32 sfc_reset_bit;
	int ret;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return 0;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;

		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
		sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;

		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
		break;

	default:
		return 0;
	}

	/*
	 * If the engine is using a SFC, tell the engine that a software reset
	 * is going to happen. The engine will then try to force lock the SFC.
	 * If SFC ends up being locked to the engine we want to reset, we have
	 * to reset it as well (we will unlock it once the reset sequence is
	 * completed).
	 */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);

	ret = __intel_wait_for_register_fw(uncore,
					   sfc_forced_lock_ack,
					   sfc_forced_lock_ack_bit,
					   sfc_forced_lock_ack_bit,
					   1000, 0, NULL);

	/* Was the SFC released while we were trying to lock it? */
	if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
		return 0;

	if (ret) {
		drm_dbg(&engine->i915->drm,
			"Wait for SFC forced lock ack failed\n");
		return ret;
	}

	*hw_mask |= sfc_reset_bit;
	return 0;
}

static void gen11_unlock_sfc(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
	i915_reg_t sfc_forced_lock;
	u32 sfc_forced_lock_bit;

	switch (engine->class) {
	case VIDEO_DECODE_CLASS:
		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
			return;

		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
		break;

	case VIDEO_ENHANCEMENT_CLASS:
		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
		break;

	default:
		return;
	}

	rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
}

static int gen11_reset_engines(struct intel_gt *gt,
			       intel_engine_mask_t engine_mask,
			       unsigned int retry)
{
	static const u32 hw_engine_mask[] = {
		[RCS0]  = GEN11_GRDOM_RENDER,
		[BCS0]  = GEN11_GRDOM_BLT,
		[VCS0]  = GEN11_GRDOM_MEDIA,
		[VCS1]  = GEN11_GRDOM_MEDIA2,
		[VCS2]  = GEN11_GRDOM_MEDIA3,
		[VCS3]  = GEN11_GRDOM_MEDIA4,
		[VECS0] = GEN11_GRDOM_VECS,
		[VECS1] = GEN11_GRDOM_VECS2,
	};
	struct intel_engine_cs *engine;
	intel_engine_mask_t tmp;
	u32 hw_mask;
	int ret;

	if (engine_mask == ALL_ENGINES) {
		hw_mask = GEN11_GRDOM_FULL;
	} else {
		hw_mask = 0;
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
			hw_mask |= hw_engine_mask[engine->id];
			ret = gen11_lock_sfc(engine, &hw_mask);
			if (ret)
				goto sfc_unlock;
		}
	}

	ret = gen6_hw_domain_reset(gt, hw_mask);

sfc_unlock:
	/*
	 * We unlock the SFC based on the lock status and not the result of
	 * gen11_lock_sfc to make sure that we clean properly if something
	 * wrong happened during the lock (e.g. lock acquired after timeout
	 * expiration).
	 */
	if (engine_mask != ALL_ENGINES)
		for_each_engine_masked(engine, gt, engine_mask, tmp)
			gen11_unlock_sfc(engine);

	return ret;
}

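/*
 * Gen8+ engines provide a ready-for-reset handshake: request a reset via
 * RING_RESET_CTL and wait for the engine to acknowledge that it is safe to
 * proceed before the reset domain is actually pulled.
 */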
static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
	u32 request, mask, ack;
	int ret;

	ack = intel_uncore_read_fw(uncore, reg);
	if (ack & RESET_CTL_CAT_ERROR) {
		/*
		 * For catastrophic errors, ready-for-reset sequence
		 * needs to be bypassed: HAS#396813
		 */
		request = RESET_CTL_CAT_ERROR;
		mask = RESET_CTL_CAT_ERROR;

		/* Catastrophic errors need to be cleared by HW */
		ack = 0;
	} else if (!(ack & RESET_CTL_READY_TO_RESET)) {
		request = RESET_CTL_REQUEST_RESET;
		mask = RESET_CTL_READY_TO_RESET;
		ack = RESET_CTL_READY_TO_RESET;
	} else {
		return 0;
	}

	intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
	ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
					   700, 0, NULL);
	if (ret)
		drm_err(&engine->i915->drm,
			"%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
			engine->name, request,
			intel_uncore_read_fw(uncore, reg));

	return ret;
}

static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
{
	intel_uncore_write_fw(engine->uncore,
			      RING_RESET_CTL(engine->mmio_base),
			      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
}

static int gen8_reset_engines(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask,
			      unsigned int retry)
{
	struct intel_engine_cs *engine;
	const bool reset_non_ready = retry >= 1;
	intel_engine_mask_t tmp;
	int ret;

	for_each_engine_masked(engine, gt, engine_mask, tmp) {
		ret = gen8_engine_reset_prepare(engine);
		if (ret && !reset_non_ready)
			goto skip_reset;

		/*
		 * If this is not the first failed attempt to prepare,
		 * we decide to proceed anyway.
		 *
		 * By doing so we risk context corruption and with
		 * some gens (kbl), possible system hang if reset
		 * happens during active bb execution.
		 *
		 * We rather take context corruption instead of
		 * failed reset with a wedged driver/gpu. And
		 * active bb execution case should be covered by
		 * stop_engines() we have before the reset.
		 */
	}

	if (INTEL_GEN(gt->i915) >= 11)
		ret = gen11_reset_engines(gt, engine_mask, retry);
	else
		ret = gen6_reset_engines(gt, engine_mask, retry);

skip_reset:
	for_each_engine_masked(engine, gt, engine_mask, tmp)
		gen8_engine_reset_cancel(engine);

	return ret;
}

static int mock_reset(struct intel_gt *gt,
		      intel_engine_mask_t mask,
		      unsigned int retry)
{
	return 0;
}

typedef int (*reset_func)(struct intel_gt *,
			  intel_engine_mask_t engine_mask,
			  unsigned int retry);

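/* Select the reset backend appropriate for this platform generation. */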
static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
{
	struct drm_i915_private *i915 = gt->i915;

	if (is_mock_gt(gt))
		return mock_reset;
	else if (INTEL_GEN(i915) >= 8)
		return gen8_reset_engines;
	else if (INTEL_GEN(i915) >= 6)
		return gen6_reset_engines;
	else if (INTEL_GEN(i915) >= 5)
		return ilk_do_reset;
	else if (IS_G4X(i915))
		return g4x_do_reset;
	else if (IS_G33(i915) || IS_PINEVIEW(i915))
		return g33_do_reset;
	else if (INTEL_GEN(i915) >= 3)
		return i915_do_reset;
	else
		return NULL;
}

int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
	const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
	reset_func reset;
	int ret = -ETIMEDOUT;
	int retry;

	reset = intel_get_gpu_reset(gt);
	if (!reset)
		return -ENODEV;

	/*
	 * If the power well sleeps during the reset, the reset
	 * request may be dropped and never completes (causing -EIO).
	 */
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
		GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
		preempt_disable();
		ret = reset(gt, engine_mask, retry);
		preempt_enable();
	}
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

bool intel_has_gpu_reset(const struct intel_gt *gt)
{
	if (!gt->i915->params.reset)
		return false;

	return intel_get_gpu_reset(gt);
}

bool intel_has_reset_engine(const struct intel_gt *gt)
{
	if (gt->i915->params.reset < 2)
		return false;

	return INTEL_INFO(gt->i915)->has_reset_engine;
}

int intel_reset_guc(struct intel_gt *gt)
{
	u32 guc_domain =
		INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
	int ret;

	GEM_BUG_ON(!HAS_GT_UC(gt->i915));

	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
	ret = gen6_hw_domain_reset(gt, guc_domain);
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

	return ret;
}

/*
 * Ensure the irq handler finishes and is not run again.
 * Also return the active request so that we only search for it once.
 */
static void reset_prepare_engine(struct intel_engine_cs *engine)
{
	/*
	 * During the reset sequence, we must prevent the engine from
	 * entering RC6. As the context state is undefined until we restart
	 * the engine, if it does enter RC6 during the reset, the state
	 * written to the powercontext is undefined and so we may lose
	 * GPU state upon resume, i.e. fail to restart after a reset.
	 */
	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
	if (engine->reset.prepare)
		engine->reset.prepare(engine);
}

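/*
 * Userspace mmaps through the GGTT rely on fence registers that the reset
 * may clobber; unmap them so that any access faults and is re-established
 * only after the fences have been restored.
 */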
static void revoke_mmaps(struct intel_gt *gt)
{
	int i;

	for (i = 0; i < gt->ggtt->num_fences; i++) {
		struct drm_vma_offset_node *node;
		struct i915_vma *vma;
		u64 vma_offset;

		vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
		if (!vma)
			continue;

		if (!i915_vma_has_userfault(vma))
			continue;

		GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);

		if (!vma->mmo)
			continue;

		node = &vma->mmo->vma_node;
		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;

		unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
				    drm_vma_node_offset_addr(node) + vma_offset,
				    vma->size,
				    1);
	}
}

static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake = 0;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (intel_engine_pm_get_if_awake(engine))
			awake |= engine->mask;
		reset_prepare_engine(engine);
	}

	intel_uc_reset_prepare(&gt->uc);

	return awake;
}

static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}

static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err;

	/*
	 * Everything depends on having the GTT running, so we need to start
	 * there.
	 */
	err = i915_ggtt_enable_hw(gt->i915);
	if (err)
		return err;

	for_each_engine(engine, gt, id)
		__intel_engine_reset(engine, stalled_mask & engine->mask);

	intel_ggtt_restore_fences(gt->ggtt);

	return err;
}

static void reset_finish_engine(struct intel_engine_cs *engine)
{
	if (engine->reset.finish)
		engine->reset.finish(engine);
	intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);

	intel_engine_signal_breadcrumbs(engine);
}

static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		reset_finish_engine(engine);
		if (awake & engine->mask)
			intel_engine_pm_put(engine);
	}
}

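/*
 * Once the GT is wedged, new requests are not submitted to the hardware;
 * they are completed immediately with -EIO instead.
 */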
static void nop_submit_request(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	RQ_TRACE(request, "-EIO\n");
	i915_request_set_error_once(request, -EIO);

	spin_lock_irqsave(&engine->active.lock, flags);
	__i915_request_submit(request);
	i915_request_mark_complete(request);
	spin_unlock_irqrestore(&engine->active.lock, flags);

	intel_engine_signal_breadcrumbs(engine);
}

static void __intel_gt_set_wedged(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	intel_engine_mask_t awake;
	enum intel_engine_id id;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	GT_TRACE(gt, "start\n");

	/*
	 * First, stop submission to hw, but do not yet complete requests by
	 * rolling the global seqno forward (since this would complete requests
	 * for which we haven't set the fence error to EIO yet).
	 */
	awake = reset_prepare(gt);

	/* Even if the GPU reset fails, it should still stop the engines */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		__intel_gt_reset(gt, ALL_ENGINES);

	for_each_engine(engine, gt, id)
		engine->submit_request = nop_submit_request;

	/*
	 * Make sure no request can slip through without getting completed by
	 * either this call here to intel_engine_write_global_seqno, or the one
	 * in nop_submit_request.
	 */
	synchronize_rcu_expedited();
	set_bit(I915_WEDGED, &gt->reset.flags);

	/* Mark all executing requests as skipped */
	for_each_engine(engine, gt, id)
		if (engine->reset.cancel)
			engine->reset.cancel(engine);

	reset_finish(gt, awake);

	GT_TRACE(gt, "end\n");
}

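/*
 * Public entry point for declaring the GPU wedged: takes the reset mutex,
 * dumps engine state for debugging and then tears down all in-flight work
 * via __intel_gt_set_wedged().
 */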
void intel_gt_set_wedged(struct intel_gt *gt)
{
	intel_wakeref_t wakeref;

	if (test_bit(I915_WEDGED, &gt->reset.flags))
		return;

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
	mutex_lock(&gt->reset.mutex);

	if (GEM_SHOW_DEBUG()) {
		struct drm_printer p = drm_debug_printer(__func__);
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
		for_each_engine(engine, gt, id) {
			if (intel_engine_is_idle(engine))
				continue;

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
	}

	__intel_gt_set_wedged(gt);

	mutex_unlock(&gt->reset.mutex);
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

static bool __intel_gt_unset_wedged(struct intel_gt *gt)
{
	struct intel_gt_timelines *timelines = &gt->timelines;
	struct intel_timeline *tl;
	bool ok;

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		return true;

	/* Never fully initialised, recovery impossible */
	if (intel_gt_has_unrecoverable_error(gt))
		return false;

	GT_TRACE(gt, "start\n");

	/*
	 * Before unwedging, make sure that all pending operations
	 * are flushed and errored out - we may have requests waiting upon
	 * third party fences. We marked all inflight requests as EIO, and
	 * every execbuf since returned EIO, for consistency we want all
	 * the currently pending requests to also be marked as EIO, which
	 * is done inside our nop_submit_request - and so we must wait.
	 *
	 * No more can be submitted until we reset the wedged bit.
	 */
	spin_lock(&timelines->lock);
	list_for_each_entry(tl, &timelines->active_list, link) {
		struct dma_fence *fence;

		fence = i915_active_fence_get(&tl->last_request);
		if (!fence)
			continue;

		spin_unlock(&timelines->lock);

		/*
		 * All internal dependencies (i915_requests) will have
		 * been flushed by the set-wedge, but we may be stuck waiting
		 * for external fences. These should all be capped to 10s
		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
		 * in the worst case.
		 */
		dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
		dma_fence_put(fence);

		/* Restart iteration after dropping lock */
		spin_lock(&timelines->lock);
		tl = list_entry(&timelines->active_list, typeof(*tl), link);
	}
	spin_unlock(&timelines->lock);

	/* We must reset pending GPU events before restoring our submission */
	ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
	if (!ok) {
		/*
		 * Warn CI about the unrecoverable wedged condition.
		 * Time for a reboot.
		 */
		add_taint_for_CI(gt->i915, TAINT_WARN);
		return false;
	}

	/*
	 * Undo nop_submit_request. We prevent all new i915 requests from
	 * being queued (by disallowing execbuf whilst wedged) so having
	 * waited for all active requests above, we know the system is idle
	 * and do not have to worry about a thread being inside
	 * engine->submit_request() as we swap over. So unlike installing
	 * the nop_submit_request on reset, we can do this from normal
	 * context and do not require stop_machine().
	 */
	intel_engines_reset_default_submission(gt);

	GT_TRACE(gt, "end\n");

	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
	clear_bit(I915_WEDGED, &gt->reset.flags);

	return true;
}

bool intel_gt_unset_wedged(struct intel_gt *gt)
{
	bool result;

	mutex_lock(&gt->reset.mutex);
	result = __intel_gt_unset_wedged(gt);
	mutex_unlock(&gt->reset.mutex);

	return result;
}

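/*
 * Revoke userspace mmaps, perform the hardware reset (retrying with an
 * increasing backoff if the GPU does not respond) and then restore the GT
 * state for the stalled engines.
 */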
static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
{
	int err, i;

	gt_revoke(gt);

	err = __intel_gt_reset(gt, ALL_ENGINES);
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		msleep(10 * (i + 1));
		err = __intel_gt_reset(gt, ALL_ENGINES);
	}
	if (err)
		return err;

	return gt_reset(gt, stalled_mask);
}

static int resume(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int ret;

	for_each_engine(engine, gt, id) {
		ret = intel_engine_resume(engine);
		if (ret)
			return ret;
	}

	return 0;
}

/**
 * intel_gt_reset - reset chip after a hang
 * @gt: #intel_gt to reset
 * @stalled_mask: mask of the stalled engines with the guilty requests
 * @reason: user error message for why we are resetting
 *
 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
 * on failure.
 *
 * Procedure is fairly simple:
 *   - reset the chip using the reset reg
 *   - re-init context state
 *   - re-init hardware status page
 *   - re-init ring buffer
 *   - re-init interrupt state
 *   - re-init display
 */
void intel_gt_reset(struct intel_gt *gt,
		    intel_engine_mask_t stalled_mask,
		    const char *reason)
{
	intel_engine_mask_t awake;
	int ret;

	GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);

	might_sleep();
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
	mutex_lock(&gt->reset.mutex);

	/* Clear any previous failed attempts at recovery. Time to try again. */
	if (!__intel_gt_unset_wedged(gt))
		goto unlock;

	if (reason)
		drm_notice(&gt->i915->drm,
			   "Resetting chip for %s\n", reason);
	atomic_inc(&gt->i915->gpu_error.reset_count);

	awake = reset_prepare(gt);

	if (!intel_has_gpu_reset(gt)) {
		if (gt->i915->params.reset)
			drm_err(&gt->i915->drm, "GPU reset not supported\n");
		else
			drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
		goto error;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_disable_interrupts(gt->i915);

	if (do_reset(gt, stalled_mask)) {
		drm_err(&gt->i915->drm, "Failed to reset chip\n");
		goto taint;
	}

	if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
		intel_runtime_pm_enable_interrupts(gt->i915);

	intel_overlay_reset(gt->i915);

	/*
	 * Next we need to restore the context, but we don't use those
	 * yet either...
	 *
	 * Ring buffer needs to be re-initialized in the KMS case, or if X
	 * was running at the time of the reset (i.e. we weren't VT
	 * switched away).
	 */
	ret = intel_gt_init_hw(gt);
	if (ret) {
		drm_err(&gt->i915->drm,
			"Failed to initialise HW following reset (%d)\n",
			ret);
		goto taint;
	}

	ret = resume(gt);
	if (ret)
		goto taint;

finish:
	reset_finish(gt, awake);
unlock:
	mutex_unlock(&gt->reset.mutex);
	return;

taint:
	/*
	 * History tells us that if we cannot reset the GPU now, we
	 * never will. This then impacts everything that is run
	 * subsequently. On failing the reset, we mark the driver
	 * as wedged, preventing further execution on the GPU.
	 * We also want to go one step further and add a taint to the
	 * kernel so that any subsequent faults can be traced back to
	 * this failure. This is important for CI, where if the
	 * GPU/driver fails we would like to reboot and restart testing
	 * rather than continue on into oblivion. For everyone else,
	 * the system should still plod along, but they have been warned!
	 */
	add_taint_for_CI(gt->i915, TAINT_WARN);
error:
	__intel_gt_set_wedged(gt);
	goto finish;
}

static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
{
	return __intel_gt_reset(engine->gt, engine->mask);
}

/**
 * intel_engine_reset - reset GPU engine to recover from a hang
 * @engine: engine to reset
 * @msg: reason for GPU reset; or NULL for no drm_notice()
 *
 * Reset a specific GPU engine. Useful if a hang is detected.
 * Returns zero on successful reset or otherwise an error code.
 *
 * Procedure is:
 *  - identify the request that caused the hang and drop it
 *  - reset the engine (which forces the engine to idle)
 *  - re-init/configure the engine
 */
int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
{
	struct intel_gt *gt = engine->gt;
	bool uses_guc = intel_engine_in_guc_submission_mode(engine);
	int ret;

	ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	reset_prepare_engine(engine);

	if (msg)
		drm_notice(&engine->i915->drm,
			   "Resetting %s for %s\n", engine->name, msg);
	atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);

	if (!uses_guc)
		ret = intel_gt_reset_engine(engine);
	else
		ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
	if (ret) {
		/* If we fail here, we expect to fallback to a global reset */
		drm_dbg(&gt->i915->drm, "%sFailed to reset %s, ret=%d\n",
			uses_guc ? "GuC " : "", engine->name, ret);
		goto out;
	}

	/*
	 * The request that caused the hang is stuck on elsp, we know the
	 * active request and can drop it, adjust head to skip the offending
	 * request to resume executing remaining requests in the queue.
	 */
	__intel_engine_reset(engine, true);

	/*
	 * The engine and its registers (and workarounds in case of render)
	 * have been reset to their default values. Follow the init_ring
	 * process to program RING_MODE, HWSP and re-enable submission.
	 */
	ret = intel_engine_resume(engine);

out:
	intel_engine_cancel_stop_cs(engine);
	reset_finish_engine(engine);
	intel_engine_pm_put_async(engine);
	return ret;
}

static void intel_gt_reset_global(struct intel_gt *gt,
				  u32 engine_mask,
				  const char *reason)
{
	struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
	struct intel_wedge_me w;

	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

	drm_dbg(&gt->i915->drm, "resetting chip, engines=%x\n", engine_mask);
	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);

	/* Use a watchdog to ensure that our reset completes */
	intel_wedge_on_timeout(&w, gt, 5 * HZ) {
		intel_display_prepare_reset(gt->i915);

		/* Flush everyone using a resource about to be clobbered */
		synchronize_srcu_expedited(&gt->reset.backoff_srcu);

		intel_gt_reset(gt, engine_mask, reason);

		intel_display_finish_reset(gt->i915);
	}

	if (!test_bit(I915_WEDGED, &gt->reset.flags))
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
}

/**
 * intel_gt_handle_error - handle a gpu error
 * @gt: the intel_gt
 * @engine_mask: mask representing engines that are hung
 * @flags: control flags
 * @fmt: Error message format string
 *
 * Do some basic checking of register state at error time and
 * dump it to the syslog. Also call i915_capture_error_state() to make
 * sure we get a record and make it available in debugfs. Fire a uevent
 * so userspace knows something bad happened (should trigger collection
 * of a ring dump etc.).
 */
void intel_gt_handle_error(struct intel_gt *gt,
			   intel_engine_mask_t engine_mask,
			   unsigned long flags,
			   const char *fmt, ...)
{
	struct intel_engine_cs *engine;
	intel_wakeref_t wakeref;
	intel_engine_mask_t tmp;
	char error_msg[80];
	char *msg = NULL;

	if (fmt) {
		va_list args;

		va_start(args, fmt);
		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
		va_end(args);

		msg = error_msg;
	}

	/*
	 * In most cases it's guaranteed that we get here with an RPM
	 * reference held, for example because there is a pending GPU
	 * request that won't finish until the reset is done. This
	 * isn't the case at least when we get here by doing a
	 * simulated reset via debugfs, so get an RPM reference.
	 */
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	engine_mask &= gt->info.engine_mask;

	if (flags & I915_ERROR_CAPTURE) {
		i915_capture_error_state(gt, engine_mask);
		intel_gt_clear_error_registers(gt, engine_mask);
	}

	/*
	 * Try engine reset when available. We fall back to full reset if
	 * single reset fails.
	 */
	if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
		for_each_engine_masked(engine, gt, engine_mask, tmp) {
			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					     &gt->reset.flags))
				continue;

			if (intel_engine_reset(engine, msg) == 0)
				engine_mask &= ~engine->mask;

			clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
					      &gt->reset.flags);
		}
	}

	if (!engine_mask)
		goto out;

	/* Full reset needs the mutex, stop any other user trying to do so. */
	if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		wait_event(gt->reset.queue,
			   !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
		goto out; /* piggy-back on the other reset */
	}

	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
	synchronize_rcu_expedited();

	/* Prevent any other reset-engine attempt. */
	for_each_engine(engine, gt, tmp) {
		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
					&gt->reset.flags))
			wait_on_bit(&gt->reset.flags,
				    I915_RESET_ENGINE + engine->id,
				    TASK_UNINTERRUPTIBLE);
	}

	intel_gt_reset_global(gt, engine_mask, msg);

	for_each_engine(engine, gt, tmp)
		clear_bit_unlock(I915_RESET_ENGINE + engine->id,
				 &gt->reset.flags);
	clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
	smp_mb__after_atomic();
	wake_up_all(&gt->reset.queue);

out:
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}

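/*
 * Block while a full GPU reset is pending and then take a read-side
 * reference on the reset backoff SRCU, preventing a new reset from starting
 * until the caller releases it with intel_gt_reset_unlock().
 */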
int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
{
	might_lock(&gt->reset.backoff_srcu);
	might_sleep();

	rcu_read_lock();
	while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
		rcu_read_unlock();

		if (wait_event_interruptible(gt->reset.queue,
					     !test_bit(I915_RESET_BACKOFF,
						       &gt->reset.flags)))
			return -EINTR;

		rcu_read_lock();
	}
	*srcu = srcu_read_lock(&gt->reset.backoff_srcu);
	rcu_read_unlock();

	return 0;
}

void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
__releases(&gt->reset.backoff_srcu)
{
	srcu_read_unlock(&gt->reset.backoff_srcu, tag);
}

int intel_gt_terminally_wedged(struct intel_gt *gt)
{
	might_sleep();

	if (!intel_gt_is_wedged(gt))
		return 0;

	if (intel_gt_has_unrecoverable_error(gt))
		return -EIO;

	/* Reset still in progress? Maybe we will recover? */
	if (wait_event_interruptible(gt->reset.queue,
				     !test_bit(I915_RESET_BACKOFF,
					       &gt->reset.flags)))
		return -EINTR;

	return intel_gt_is_wedged(gt) ? -EIO : 0;
}

void intel_gt_set_wedged_on_init(struct intel_gt *gt)
{
	BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
		     I915_WEDGED_ON_INIT);
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);

	/* Wedged on init is non-recoverable */
	add_taint_for_CI(gt->i915, TAINT_WARN);
}

void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
{
	intel_gt_set_wedged(gt);
	set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
	intel_gt_retire_requests(gt); /* cleanup any wedged requests */
}

void intel_gt_init_reset(struct intel_gt *gt)
{
	init_waitqueue_head(&gt->reset.queue);
	mutex_init(&gt->reset.mutex);
	init_srcu_struct(&gt->reset.backoff_srcu);

	/* no GPU until we are ready! */
	__set_bit(I915_WEDGED, &gt->reset.flags);
}

void intel_gt_fini_reset(struct intel_gt *gt)
{
	cleanup_srcu_struct(&gt->reset.backoff_srcu);
}

static void intel_wedge_me(struct work_struct *work)
{
	struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);

	drm_err(&w->gt->i915->drm,
		"%s timed out, cancelling all in-flight rendering.\n",
		w->name);
	intel_gt_set_wedged(w->gt);
}

void __intel_init_wedge(struct intel_wedge_me *w,
			struct intel_gt *gt,
			long timeout,
			const char *name)
{
	w->gt = gt;
	w->name = name;

	INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

void __intel_fini_wedge(struct intel_wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->gt = NULL;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_reset.c"
#include "selftest_hangcheck.c"
#endif