2 * SPDX-License-Identifier: MIT
4 * Copyright © 2008-2018 Intel Corporation
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
10 #include "display/intel_display_types.h"
11 #include "display/intel_overlay.h"
13 #include "gem/i915_gem_context.h"
16 #include "i915_gpu_error.h"
18 #include "intel_engine_pm.h"
20 #include "intel_gt_pm.h"
21 #include "intel_reset.h"
23 #include "uc/intel_guc.h"
24 #include "uc/intel_guc_submission.h"
26 #define RESET_MAX_RETRIES 3
28 /* XXX How to handle concurrent GGTT updates using tiling registers? */
29 #define RESET_UNDER_STOP_MACHINE 0
31 static void rmw_set_fw(struct intel_uncore
*uncore
, i915_reg_t reg
, u32 set
)
33 intel_uncore_rmw_fw(uncore
, reg
, 0, set
);
36 static void rmw_clear_fw(struct intel_uncore
*uncore
, i915_reg_t reg
, u32 clr
)
38 intel_uncore_rmw_fw(uncore
, reg
, clr
, 0);
41 static void engine_skip_context(struct i915_request
*rq
)
43 struct intel_engine_cs
*engine
= rq
->engine
;
44 struct intel_context
*hung_ctx
= rq
->context
;
46 if (!i915_request_is_active(rq
))
49 lockdep_assert_held(&engine
->active
.lock
);
50 list_for_each_entry_continue(rq
, &engine
->active
.requests
, sched
.link
)
51 if (rq
->context
== hung_ctx
)
52 i915_request_skip(rq
, -EIO
);
55 static void client_mark_guilty(struct i915_gem_context
*ctx
, bool banned
)
57 struct drm_i915_file_private
*file_priv
= ctx
->file_priv
;
58 unsigned long prev_hang
;
61 if (IS_ERR_OR_NULL(file_priv
))
66 score
= I915_CLIENT_SCORE_CONTEXT_BAN
;
68 prev_hang
= xchg(&file_priv
->hang_timestamp
, jiffies
);
69 if (time_before(jiffies
, prev_hang
+ I915_CLIENT_FAST_HANG_JIFFIES
))
70 score
+= I915_CLIENT_SCORE_HANG_FAST
;
73 atomic_add(score
, &file_priv
->ban_score
);
75 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
77 atomic_read(&file_priv
->ban_score
));
81 static bool mark_guilty(struct i915_request
*rq
)
83 struct i915_gem_context
*ctx
;
84 unsigned long prev_hang
;
89 ctx
= rcu_dereference(rq
->context
->gem_context
);
90 if (ctx
&& !kref_get_unless_zero(&ctx
->ref
))
96 if (i915_gem_context_is_closed(ctx
)) {
97 intel_context_set_banned(rq
->context
);
102 atomic_inc(&ctx
->guilty_count
);
104 /* Cool contexts are too cool to be banned! (Used for reset testing.) */
105 if (!i915_gem_context_is_bannable(ctx
)) {
110 dev_notice(ctx
->i915
->drm
.dev
,
111 "%s context reset due to GPU hang\n",
114 /* Record the timestamp for the last N hangs */
115 prev_hang
= ctx
->hang_timestamp
[0];
116 for (i
= 0; i
< ARRAY_SIZE(ctx
->hang_timestamp
) - 1; i
++)
117 ctx
->hang_timestamp
[i
] = ctx
->hang_timestamp
[i
+ 1];
118 ctx
->hang_timestamp
[i
] = jiffies
;
120 /* If we have hung N+1 times in rapid succession, we ban the context! */
121 banned
= !i915_gem_context_is_recoverable(ctx
);
122 if (time_before(jiffies
, prev_hang
+ CONTEXT_FAST_HANG_JIFFIES
))
125 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
126 ctx
->name
, atomic_read(&ctx
->guilty_count
));
127 intel_context_set_banned(rq
->context
);
130 client_mark_guilty(ctx
, banned
);
133 i915_gem_context_put(ctx
);
137 static void mark_innocent(struct i915_request
*rq
)
139 struct i915_gem_context
*ctx
;
142 ctx
= rcu_dereference(rq
->context
->gem_context
);
144 atomic_inc(&ctx
->active_count
);
148 void __i915_request_reset(struct i915_request
*rq
, bool guilty
)
150 RQ_TRACE(rq
, "guilty? %s\n", yesno(guilty
));
152 GEM_BUG_ON(i915_request_completed(rq
));
154 rcu_read_lock(); /* protect the GEM context */
156 i915_request_skip(rq
, -EIO
);
158 engine_skip_context(rq
);
160 dma_fence_set_error(&rq
->fence
, -EAGAIN
);
166 static bool i915_in_reset(struct pci_dev
*pdev
)
170 pci_read_config_byte(pdev
, I915_GDRST
, &gdrst
);
171 return gdrst
& GRDOM_RESET_STATUS
;
174 static int i915_do_reset(struct intel_gt
*gt
,
175 intel_engine_mask_t engine_mask
,
178 struct pci_dev
*pdev
= gt
->i915
->drm
.pdev
;
181 /* Assert reset for at least 20 usec, and wait for acknowledgement. */
182 pci_write_config_byte(pdev
, I915_GDRST
, GRDOM_RESET_ENABLE
);
184 err
= wait_for_atomic(i915_in_reset(pdev
), 50);
186 /* Clear the reset request. */
187 pci_write_config_byte(pdev
, I915_GDRST
, 0);
190 err
= wait_for_atomic(!i915_in_reset(pdev
), 50);
195 static bool g4x_reset_complete(struct pci_dev
*pdev
)
199 pci_read_config_byte(pdev
, I915_GDRST
, &gdrst
);
200 return (gdrst
& GRDOM_RESET_ENABLE
) == 0;
203 static int g33_do_reset(struct intel_gt
*gt
,
204 intel_engine_mask_t engine_mask
,
207 struct pci_dev
*pdev
= gt
->i915
->drm
.pdev
;
209 pci_write_config_byte(pdev
, I915_GDRST
, GRDOM_RESET_ENABLE
);
210 return wait_for_atomic(g4x_reset_complete(pdev
), 50);
213 static int g4x_do_reset(struct intel_gt
*gt
,
214 intel_engine_mask_t engine_mask
,
217 struct pci_dev
*pdev
= gt
->i915
->drm
.pdev
;
218 struct intel_uncore
*uncore
= gt
->uncore
;
221 /* WaVcpClkGateDisableForMediaReset:ctg,elk */
222 rmw_set_fw(uncore
, VDECCLK_GATE_D
, VCP_UNIT_CLOCK_GATE_DISABLE
);
223 intel_uncore_posting_read_fw(uncore
, VDECCLK_GATE_D
);
225 pci_write_config_byte(pdev
, I915_GDRST
,
226 GRDOM_MEDIA
| GRDOM_RESET_ENABLE
);
227 ret
= wait_for_atomic(g4x_reset_complete(pdev
), 50);
229 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
233 pci_write_config_byte(pdev
, I915_GDRST
,
234 GRDOM_RENDER
| GRDOM_RESET_ENABLE
);
235 ret
= wait_for_atomic(g4x_reset_complete(pdev
), 50);
237 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
242 pci_write_config_byte(pdev
, I915_GDRST
, 0);
244 rmw_clear_fw(uncore
, VDECCLK_GATE_D
, VCP_UNIT_CLOCK_GATE_DISABLE
);
245 intel_uncore_posting_read_fw(uncore
, VDECCLK_GATE_D
);
250 static int ilk_do_reset(struct intel_gt
*gt
, intel_engine_mask_t engine_mask
,
253 struct intel_uncore
*uncore
= gt
->uncore
;
256 intel_uncore_write_fw(uncore
, ILK_GDSR
,
257 ILK_GRDOM_RENDER
| ILK_GRDOM_RESET_ENABLE
);
258 ret
= __intel_wait_for_register_fw(uncore
, ILK_GDSR
,
259 ILK_GRDOM_RESET_ENABLE
, 0,
263 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
267 intel_uncore_write_fw(uncore
, ILK_GDSR
,
268 ILK_GRDOM_MEDIA
| ILK_GRDOM_RESET_ENABLE
);
269 ret
= __intel_wait_for_register_fw(uncore
, ILK_GDSR
,
270 ILK_GRDOM_RESET_ENABLE
, 0,
274 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
279 intel_uncore_write_fw(uncore
, ILK_GDSR
, 0);
280 intel_uncore_posting_read_fw(uncore
, ILK_GDSR
);
284 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
285 static int gen6_hw_domain_reset(struct intel_gt
*gt
, u32 hw_domain_mask
)
287 struct intel_uncore
*uncore
= gt
->uncore
;
291 * GEN6_GDRST is not in the gt power well, no need to check
292 * for fifo space for the write or forcewake the chip for
295 intel_uncore_write_fw(uncore
, GEN6_GDRST
, hw_domain_mask
);
297 /* Wait for the device to ack the reset requests */
298 err
= __intel_wait_for_register_fw(uncore
,
299 GEN6_GDRST
, hw_domain_mask
, 0,
303 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
309 static int gen6_reset_engines(struct intel_gt
*gt
,
310 intel_engine_mask_t engine_mask
,
313 static const u32 hw_engine_mask
[] = {
314 [RCS0
] = GEN6_GRDOM_RENDER
,
315 [BCS0
] = GEN6_GRDOM_BLT
,
316 [VCS0
] = GEN6_GRDOM_MEDIA
,
317 [VCS1
] = GEN8_GRDOM_MEDIA2
,
318 [VECS0
] = GEN6_GRDOM_VECS
,
320 struct intel_engine_cs
*engine
;
323 if (engine_mask
== ALL_ENGINES
) {
324 hw_mask
= GEN6_GRDOM_FULL
;
326 intel_engine_mask_t tmp
;
329 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
) {
330 GEM_BUG_ON(engine
->id
>= ARRAY_SIZE(hw_engine_mask
));
331 hw_mask
|= hw_engine_mask
[engine
->id
];
335 return gen6_hw_domain_reset(gt
, hw_mask
);
338 static int gen11_lock_sfc(struct intel_engine_cs
*engine
, u32
*hw_mask
)
340 struct intel_uncore
*uncore
= engine
->uncore
;
341 u8 vdbox_sfc_access
= RUNTIME_INFO(engine
->i915
)->vdbox_sfc_access
;
342 i915_reg_t sfc_forced_lock
, sfc_forced_lock_ack
;
343 u32 sfc_forced_lock_bit
, sfc_forced_lock_ack_bit
;
344 i915_reg_t sfc_usage
;
349 switch (engine
->class) {
350 case VIDEO_DECODE_CLASS
:
351 if ((BIT(engine
->instance
) & vdbox_sfc_access
) == 0)
354 sfc_forced_lock
= GEN11_VCS_SFC_FORCED_LOCK(engine
);
355 sfc_forced_lock_bit
= GEN11_VCS_SFC_FORCED_LOCK_BIT
;
357 sfc_forced_lock_ack
= GEN11_VCS_SFC_LOCK_STATUS(engine
);
358 sfc_forced_lock_ack_bit
= GEN11_VCS_SFC_LOCK_ACK_BIT
;
360 sfc_usage
= GEN11_VCS_SFC_LOCK_STATUS(engine
);
361 sfc_usage_bit
= GEN11_VCS_SFC_USAGE_BIT
;
362 sfc_reset_bit
= GEN11_VCS_SFC_RESET_BIT(engine
->instance
);
365 case VIDEO_ENHANCEMENT_CLASS
:
366 sfc_forced_lock
= GEN11_VECS_SFC_FORCED_LOCK(engine
);
367 sfc_forced_lock_bit
= GEN11_VECS_SFC_FORCED_LOCK_BIT
;
369 sfc_forced_lock_ack
= GEN11_VECS_SFC_LOCK_ACK(engine
);
370 sfc_forced_lock_ack_bit
= GEN11_VECS_SFC_LOCK_ACK_BIT
;
372 sfc_usage
= GEN11_VECS_SFC_USAGE(engine
);
373 sfc_usage_bit
= GEN11_VECS_SFC_USAGE_BIT
;
374 sfc_reset_bit
= GEN11_VECS_SFC_RESET_BIT(engine
->instance
);
382 * If the engine is using a SFC, tell the engine that a software reset
383 * is going to happen. The engine will then try to force lock the SFC.
384 * If SFC ends up being locked to the engine we want to reset, we have
385 * to reset it as well (we will unlock it once the reset sequence is
388 if (!(intel_uncore_read_fw(uncore
, sfc_usage
) & sfc_usage_bit
))
391 rmw_set_fw(uncore
, sfc_forced_lock
, sfc_forced_lock_bit
);
393 ret
= __intel_wait_for_register_fw(uncore
,
395 sfc_forced_lock_ack_bit
,
396 sfc_forced_lock_ack_bit
,
399 /* Was the SFC released while we were trying to lock it? */
400 if (!(intel_uncore_read_fw(uncore
, sfc_usage
) & sfc_usage_bit
))
404 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
408 *hw_mask
|= sfc_reset_bit
;
412 static void gen11_unlock_sfc(struct intel_engine_cs
*engine
)
414 struct intel_uncore
*uncore
= engine
->uncore
;
415 u8 vdbox_sfc_access
= RUNTIME_INFO(engine
->i915
)->vdbox_sfc_access
;
416 i915_reg_t sfc_forced_lock
;
417 u32 sfc_forced_lock_bit
;
419 switch (engine
->class) {
420 case VIDEO_DECODE_CLASS
:
421 if ((BIT(engine
->instance
) & vdbox_sfc_access
) == 0)
424 sfc_forced_lock
= GEN11_VCS_SFC_FORCED_LOCK(engine
);
425 sfc_forced_lock_bit
= GEN11_VCS_SFC_FORCED_LOCK_BIT
;
428 case VIDEO_ENHANCEMENT_CLASS
:
429 sfc_forced_lock
= GEN11_VECS_SFC_FORCED_LOCK(engine
);
430 sfc_forced_lock_bit
= GEN11_VECS_SFC_FORCED_LOCK_BIT
;
437 rmw_clear_fw(uncore
, sfc_forced_lock
, sfc_forced_lock_bit
);
440 static int gen11_reset_engines(struct intel_gt
*gt
,
441 intel_engine_mask_t engine_mask
,
444 static const u32 hw_engine_mask
[] = {
445 [RCS0
] = GEN11_GRDOM_RENDER
,
446 [BCS0
] = GEN11_GRDOM_BLT
,
447 [VCS0
] = GEN11_GRDOM_MEDIA
,
448 [VCS1
] = GEN11_GRDOM_MEDIA2
,
449 [VCS2
] = GEN11_GRDOM_MEDIA3
,
450 [VCS3
] = GEN11_GRDOM_MEDIA4
,
451 [VECS0
] = GEN11_GRDOM_VECS
,
452 [VECS1
] = GEN11_GRDOM_VECS2
,
454 struct intel_engine_cs
*engine
;
455 intel_engine_mask_t tmp
;
459 if (engine_mask
== ALL_ENGINES
) {
460 hw_mask
= GEN11_GRDOM_FULL
;
463 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
) {
464 GEM_BUG_ON(engine
->id
>= ARRAY_SIZE(hw_engine_mask
));
465 hw_mask
|= hw_engine_mask
[engine
->id
];
466 ret
= gen11_lock_sfc(engine
, &hw_mask
);
472 ret
= gen6_hw_domain_reset(gt
, hw_mask
);
476 * We unlock the SFC based on the lock status and not the result of
477 * gen11_lock_sfc to make sure that we clean properly if something
478 * wrong happened during the lock (e.g. lock acquired after timeout
481 if (engine_mask
!= ALL_ENGINES
)
482 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
)
483 gen11_unlock_sfc(engine
);
488 static int gen8_engine_reset_prepare(struct intel_engine_cs
*engine
)
490 struct intel_uncore
*uncore
= engine
->uncore
;
491 const i915_reg_t reg
= RING_RESET_CTL(engine
->mmio_base
);
492 u32 request
, mask
, ack
;
495 ack
= intel_uncore_read_fw(uncore
, reg
);
496 if (ack
& RESET_CTL_CAT_ERROR
) {
498 * For catastrophic errors, ready-for-reset sequence
499 * needs to be bypassed: HAS#396813
501 request
= RESET_CTL_CAT_ERROR
;
502 mask
= RESET_CTL_CAT_ERROR
;
504 /* Catastrophic errors need to be cleared by HW */
506 } else if (!(ack
& RESET_CTL_READY_TO_RESET
)) {
507 request
= RESET_CTL_REQUEST_RESET
;
508 mask
= RESET_CTL_READY_TO_RESET
;
509 ack
= RESET_CTL_READY_TO_RESET
;
514 intel_uncore_write_fw(uncore
, reg
, _MASKED_BIT_ENABLE(request
));
515 ret
= __intel_wait_for_register_fw(uncore
, reg
, mask
, ack
,
518 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
519 engine
->name
, request
,
520 intel_uncore_read_fw(uncore
, reg
));
525 static void gen8_engine_reset_cancel(struct intel_engine_cs
*engine
)
527 intel_uncore_write_fw(engine
->uncore
,
528 RING_RESET_CTL(engine
->mmio_base
),
529 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET
));
532 static int gen8_reset_engines(struct intel_gt
*gt
,
533 intel_engine_mask_t engine_mask
,
536 struct intel_engine_cs
*engine
;
537 const bool reset_non_ready
= retry
>= 1;
538 intel_engine_mask_t tmp
;
541 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
) {
542 ret
= gen8_engine_reset_prepare(engine
);
543 if (ret
&& !reset_non_ready
)
547 * If this is not the first failed attempt to prepare,
548 * we decide to proceed anyway.
550 * By doing so we risk context corruption and with
551 * some gens (kbl), possible system hang if reset
552 * happens during active bb execution.
554 * We rather take context corruption instead of
555 * failed reset with a wedged driver/gpu. And
556 * active bb execution case should be covered by
557 * stop_engines() we have before the reset.
561 if (INTEL_GEN(gt
->i915
) >= 11)
562 ret
= gen11_reset_engines(gt
, engine_mask
, retry
);
564 ret
= gen6_reset_engines(gt
, engine_mask
, retry
);
567 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
)
568 gen8_engine_reset_cancel(engine
);
573 static int mock_reset(struct intel_gt
*gt
,
574 intel_engine_mask_t mask
,
580 typedef int (*reset_func
)(struct intel_gt
*,
581 intel_engine_mask_t engine_mask
,
584 static reset_func
intel_get_gpu_reset(const struct intel_gt
*gt
)
586 struct drm_i915_private
*i915
= gt
->i915
;
590 else if (INTEL_GEN(i915
) >= 8)
591 return gen8_reset_engines
;
592 else if (INTEL_GEN(i915
) >= 6)
593 return gen6_reset_engines
;
594 else if (INTEL_GEN(i915
) >= 5)
596 else if (IS_G4X(i915
))
598 else if (IS_G33(i915
) || IS_PINEVIEW(i915
))
600 else if (INTEL_GEN(i915
) >= 3)
601 return i915_do_reset
;
606 int __intel_gt_reset(struct intel_gt
*gt
, intel_engine_mask_t engine_mask
)
608 const int retries
= engine_mask
== ALL_ENGINES
? RESET_MAX_RETRIES
: 1;
610 int ret
= -ETIMEDOUT
;
613 reset
= intel_get_gpu_reset(gt
);
618 * If the power well sleeps during the reset, the reset
619 * request may be dropped and never completes (causing -EIO).
621 intel_uncore_forcewake_get(gt
->uncore
, FORCEWAKE_ALL
);
622 for (retry
= 0; ret
== -ETIMEDOUT
&& retry
< retries
; retry
++) {
623 GT_TRACE(gt
, "engine_mask=%x\n", engine_mask
);
625 ret
= reset(gt
, engine_mask
, retry
);
628 intel_uncore_forcewake_put(gt
->uncore
, FORCEWAKE_ALL
);
633 bool intel_has_gpu_reset(const struct intel_gt
*gt
)
635 if (!i915_modparams
.reset
)
638 return intel_get_gpu_reset(gt
);
641 bool intel_has_reset_engine(const struct intel_gt
*gt
)
643 if (i915_modparams
.reset
< 2)
646 return INTEL_INFO(gt
->i915
)->has_reset_engine
;
649 int intel_reset_guc(struct intel_gt
*gt
)
652 INTEL_GEN(gt
->i915
) >= 11 ? GEN11_GRDOM_GUC
: GEN9_GRDOM_GUC
;
655 GEM_BUG_ON(!HAS_GT_UC(gt
->i915
));
657 intel_uncore_forcewake_get(gt
->uncore
, FORCEWAKE_ALL
);
658 ret
= gen6_hw_domain_reset(gt
, guc_domain
);
659 intel_uncore_forcewake_put(gt
->uncore
, FORCEWAKE_ALL
);
665 * Ensure irq handler finishes, and not run again.
666 * Also return the active request so that we only search for it once.
668 static void reset_prepare_engine(struct intel_engine_cs
*engine
)
671 * During the reset sequence, we must prevent the engine from
672 * entering RC6. As the context state is undefined until we restart
673 * the engine, if it does enter RC6 during the reset, the state
674 * written to the powercontext is undefined and so we may lose
675 * GPU state upon resume, i.e. fail to restart after a reset.
677 intel_uncore_forcewake_get(engine
->uncore
, FORCEWAKE_ALL
);
678 if (engine
->reset
.prepare
)
679 engine
->reset
.prepare(engine
);
682 static void revoke_mmaps(struct intel_gt
*gt
)
686 for (i
= 0; i
< gt
->ggtt
->num_fences
; i
++) {
687 struct drm_vma_offset_node
*node
;
688 struct i915_vma
*vma
;
691 vma
= READ_ONCE(gt
->ggtt
->fence_regs
[i
].vma
);
695 if (!i915_vma_has_userfault(vma
))
698 GEM_BUG_ON(vma
->fence
!= >
->ggtt
->fence_regs
[i
]);
703 node
= &vma
->mmo
->vma_node
;
704 vma_offset
= vma
->ggtt_view
.partial
.offset
<< PAGE_SHIFT
;
706 unmap_mapping_range(gt
->i915
->drm
.anon_inode
->i_mapping
,
707 drm_vma_node_offset_addr(node
) + vma_offset
,
713 static intel_engine_mask_t
reset_prepare(struct intel_gt
*gt
)
715 struct intel_engine_cs
*engine
;
716 intel_engine_mask_t awake
= 0;
717 enum intel_engine_id id
;
719 for_each_engine(engine
, gt
, id
) {
720 if (intel_engine_pm_get_if_awake(engine
))
721 awake
|= engine
->mask
;
722 reset_prepare_engine(engine
);
725 intel_uc_reset_prepare(>
->uc
);
/*
 * Revoke userspace access ahead of a reset.
 * NOTE(review): the body was not visible in the source; the single call
 * below is restored from upstream - confirm.
 */
static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}
735 static int gt_reset(struct intel_gt
*gt
, intel_engine_mask_t stalled_mask
)
737 struct intel_engine_cs
*engine
;
738 enum intel_engine_id id
;
742 * Everything depends on having the GTT running, so we need to start
745 err
= i915_ggtt_enable_hw(gt
->i915
);
749 for_each_engine(engine
, gt
, id
)
750 __intel_engine_reset(engine
, stalled_mask
& engine
->mask
);
752 i915_gem_restore_fences(gt
->ggtt
);
757 static void reset_finish_engine(struct intel_engine_cs
*engine
)
759 if (engine
->reset
.finish
)
760 engine
->reset
.finish(engine
);
761 intel_uncore_forcewake_put(engine
->uncore
, FORCEWAKE_ALL
);
763 intel_engine_signal_breadcrumbs(engine
);
766 static void reset_finish(struct intel_gt
*gt
, intel_engine_mask_t awake
)
768 struct intel_engine_cs
*engine
;
769 enum intel_engine_id id
;
771 for_each_engine(engine
, gt
, id
) {
772 reset_finish_engine(engine
);
773 if (awake
& engine
->mask
)
774 intel_engine_pm_put(engine
);
778 static void nop_submit_request(struct i915_request
*request
)
780 struct intel_engine_cs
*engine
= request
->engine
;
783 RQ_TRACE(request
, "-EIO\n");
784 dma_fence_set_error(&request
->fence
, -EIO
);
786 spin_lock_irqsave(&engine
->active
.lock
, flags
);
787 __i915_request_submit(request
);
788 i915_request_mark_complete(request
);
789 spin_unlock_irqrestore(&engine
->active
.lock
, flags
);
791 intel_engine_signal_breadcrumbs(engine
);
794 static void __intel_gt_set_wedged(struct intel_gt
*gt
)
796 struct intel_engine_cs
*engine
;
797 intel_engine_mask_t awake
;
798 enum intel_engine_id id
;
800 if (test_bit(I915_WEDGED
, >
->reset
.flags
))
803 if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt
)) {
804 struct drm_printer p
= drm_debug_printer(__func__
);
806 for_each_engine(engine
, gt
, id
)
807 intel_engine_dump(engine
, &p
, "%s\n", engine
->name
);
810 GT_TRACE(gt
, "start\n");
813 * First, stop submission to hw, but do not yet complete requests by
814 * rolling the global seqno forward (since this would complete requests
815 * for which we haven't set the fence error to EIO yet).
817 awake
= reset_prepare(gt
);
819 /* Even if the GPU reset fails, it should still stop the engines */
820 if (!INTEL_INFO(gt
->i915
)->gpu_reset_clobbers_display
)
821 __intel_gt_reset(gt
, ALL_ENGINES
);
823 for_each_engine(engine
, gt
, id
)
824 engine
->submit_request
= nop_submit_request
;
827 * Make sure no request can slip through without getting completed by
828 * either this call here to intel_engine_write_global_seqno, or the one
829 * in nop_submit_request.
831 synchronize_rcu_expedited();
832 set_bit(I915_WEDGED
, >
->reset
.flags
);
834 /* Mark all executing requests as skipped */
835 for_each_engine(engine
, gt
, id
)
836 if (engine
->reset
.cancel
)
837 engine
->reset
.cancel(engine
);
839 reset_finish(gt
, awake
);
841 GT_TRACE(gt
, "end\n");
844 void intel_gt_set_wedged(struct intel_gt
*gt
)
846 intel_wakeref_t wakeref
;
848 mutex_lock(>
->reset
.mutex
);
849 with_intel_runtime_pm(gt
->uncore
->rpm
, wakeref
)
850 __intel_gt_set_wedged(gt
);
851 mutex_unlock(>
->reset
.mutex
);
854 static bool __intel_gt_unset_wedged(struct intel_gt
*gt
)
856 struct intel_gt_timelines
*timelines
= >
->timelines
;
857 struct intel_timeline
*tl
;
860 if (!test_bit(I915_WEDGED
, >
->reset
.flags
))
863 /* Never fully initialised, recovery impossible */
864 if (test_bit(I915_WEDGED_ON_INIT
, >
->reset
.flags
))
867 GT_TRACE(gt
, "start\n");
870 * Before unwedging, make sure that all pending operations
871 * are flushed and errored out - we may have requests waiting upon
872 * third party fences. We marked all inflight requests as EIO, and
873 * every execbuf since returned EIO, for consistency we want all
874 * the currently pending requests to also be marked as EIO, which
875 * is done inside our nop_submit_request - and so we must wait.
877 * No more can be submitted until we reset the wedged bit.
879 spin_lock(&timelines
->lock
);
880 list_for_each_entry(tl
, &timelines
->active_list
, link
) {
881 struct dma_fence
*fence
;
883 fence
= i915_active_fence_get(&tl
->last_request
);
887 spin_unlock(&timelines
->lock
);
890 * All internal dependencies (i915_requests) will have
891 * been flushed by the set-wedge, but we may be stuck waiting
892 * for external fences. These should all be capped to 10s
893 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
896 dma_fence_default_wait(fence
, false, MAX_SCHEDULE_TIMEOUT
);
897 dma_fence_put(fence
);
899 /* Restart iteration after droping lock */
900 spin_lock(&timelines
->lock
);
901 tl
= list_entry(&timelines
->active_list
, typeof(*tl
), link
);
903 spin_unlock(&timelines
->lock
);
905 /* We must reset pending GPU events before restoring our submission */
906 ok
= !HAS_EXECLISTS(gt
->i915
); /* XXX better agnosticism desired */
907 if (!INTEL_INFO(gt
->i915
)->gpu_reset_clobbers_display
)
908 ok
= __intel_gt_reset(gt
, ALL_ENGINES
) == 0;
911 * Warn CI about the unrecoverable wedged condition.
914 add_taint_for_CI(TAINT_WARN
);
919 * Undo nop_submit_request. We prevent all new i915 requests from
920 * being queued (by disallowing execbuf whilst wedged) so having
921 * waited for all active requests above, we know the system is idle
922 * and do not have to worry about a thread being inside
923 * engine->submit_request() as we swap over. So unlike installing
924 * the nop_submit_request on reset, we can do this from normal
925 * context and do not require stop_machine().
927 intel_engines_reset_default_submission(gt
);
929 GT_TRACE(gt
, "end\n");
931 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
932 clear_bit(I915_WEDGED
, >
->reset
.flags
);
937 bool intel_gt_unset_wedged(struct intel_gt
*gt
)
941 mutex_lock(>
->reset
.mutex
);
942 result
= __intel_gt_unset_wedged(gt
);
943 mutex_unlock(>
->reset
.mutex
);
948 static int do_reset(struct intel_gt
*gt
, intel_engine_mask_t stalled_mask
)
954 err
= __intel_gt_reset(gt
, ALL_ENGINES
);
955 for (i
= 0; err
&& i
< RESET_MAX_RETRIES
; i
++) {
956 msleep(10 * (i
+ 1));
957 err
= __intel_gt_reset(gt
, ALL_ENGINES
);
962 return gt_reset(gt
, stalled_mask
);
965 static int resume(struct intel_gt
*gt
)
967 struct intel_engine_cs
*engine
;
968 enum intel_engine_id id
;
971 for_each_engine(engine
, gt
, id
) {
972 ret
= engine
->resume(engine
);
981 * intel_gt_reset - reset chip after a hang
982 * @gt: #intel_gt to reset
983 * @stalled_mask: mask of the stalled engines with the guilty requests
984 * @reason: user error message for why we are resetting
986 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
989 * Procedure is fairly simple:
990 * - reset the chip using the reset reg
991 * - re-init context state
992 * - re-init hardware status page
993 * - re-init ring buffer
994 * - re-init interrupt state
997 void intel_gt_reset(struct intel_gt
*gt
,
998 intel_engine_mask_t stalled_mask
,
1001 intel_engine_mask_t awake
;
1004 GT_TRACE(gt
, "flags=%lx\n", gt
->reset
.flags
);
1007 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF
, >
->reset
.flags
));
1008 mutex_lock(>
->reset
.mutex
);
1010 /* Clear any previous failed attempts at recovery. Time to try again. */
1011 if (!__intel_gt_unset_wedged(gt
))
1015 dev_notice(gt
->i915
->drm
.dev
,
1016 "Resetting chip for %s\n", reason
);
1017 atomic_inc(>
->i915
->gpu_error
.reset_count
);
1019 awake
= reset_prepare(gt
);
1021 if (!intel_has_gpu_reset(gt
)) {
1022 if (i915_modparams
.reset
)
1023 dev_err(gt
->i915
->drm
.dev
, "GPU reset not supported\n");
1025 DRM_DEBUG_DRIVER("GPU reset disabled\n");
1029 if (INTEL_INFO(gt
->i915
)->gpu_reset_clobbers_display
)
1030 intel_runtime_pm_disable_interrupts(gt
->i915
);
1032 if (do_reset(gt
, stalled_mask
)) {
1033 dev_err(gt
->i915
->drm
.dev
, "Failed to reset chip\n");
1037 if (INTEL_INFO(gt
->i915
)->gpu_reset_clobbers_display
)
1038 intel_runtime_pm_enable_interrupts(gt
->i915
);
1040 intel_overlay_reset(gt
->i915
);
1043 * Next we need to restore the context, but we don't use those
1046 * Ring buffer needs to be re-initialized in the KMS case, or if X
1047 * was running at the time of the reset (i.e. we weren't VT
1050 ret
= intel_gt_init_hw(gt
);
1052 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
1062 reset_finish(gt
, awake
);
1064 mutex_unlock(>
->reset
.mutex
);
1069 * History tells us that if we cannot reset the GPU now, we
1070 * never will. This then impacts everything that is run
1071 * subsequently. On failing the reset, we mark the driver
1072 * as wedged, preventing further execution on the GPU.
1073 * We also want to go one step further and add a taint to the
1074 * kernel so that any subsequent faults can be traced back to
1075 * this failure. This is important for CI, where if the
1076 * GPU/driver fails we would like to reboot and restart testing
1077 * rather than continue on into oblivion. For everyone else,
1078 * the system should still plod along, but they have been warned!
1080 add_taint_for_CI(TAINT_WARN
);
1082 __intel_gt_set_wedged(gt
);
1086 static inline int intel_gt_reset_engine(struct intel_engine_cs
*engine
)
1088 return __intel_gt_reset(engine
->gt
, engine
->mask
);
1092 * intel_engine_reset - reset GPU engine to recover from a hang
1093 * @engine: engine to reset
1094 * @msg: reason for GPU reset; or NULL for no dev_notice()
1096 * Reset a specific GPU engine. Useful if a hang is detected.
1097 * Returns zero on successful reset or otherwise an error code.
1100 * - identifies the request that caused the hang and it is dropped
1101 * - reset engine (which will force the engine to idle)
1102 * - re-init/configure engine
1104 int intel_engine_reset(struct intel_engine_cs
*engine
, const char *msg
)
1106 struct intel_gt
*gt
= engine
->gt
;
1107 bool uses_guc
= intel_engine_in_guc_submission_mode(engine
);
1110 ENGINE_TRACE(engine
, "flags=%lx\n", gt
->reset
.flags
);
1111 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE
+ engine
->id
, >
->reset
.flags
));
1113 if (!intel_engine_pm_get_if_awake(engine
))
1116 reset_prepare_engine(engine
);
1119 dev_notice(engine
->i915
->drm
.dev
,
1120 "Resetting %s for %s\n", engine
->name
, msg
);
1121 atomic_inc(&engine
->i915
->gpu_error
.reset_engine_count
[engine
->uabi_class
]);
1124 ret
= intel_gt_reset_engine(engine
);
1126 ret
= intel_guc_reset_engine(&engine
->gt
->uc
.guc
, engine
);
1128 /* If we fail here, we expect to fallback to a global reset */
1129 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1130 uses_guc
? "GuC " : "",
1136 * The request that caused the hang is stuck on elsp, we know the
1137 * active request and can drop it, adjust head to skip the offending
1138 * request to resume executing remaining requests in the queue.
1140 __intel_engine_reset(engine
, true);
1143 * The engine and its registers (and workarounds in case of render)
1144 * have been reset to their default values. Follow the init_ring
1145 * process to program RING_MODE, HWSP and re-enable submission.
1147 ret
= engine
->resume(engine
);
1150 intel_engine_cancel_stop_cs(engine
);
1151 reset_finish_engine(engine
);
1152 intel_engine_pm_put_async(engine
);
1156 static void intel_gt_reset_global(struct intel_gt
*gt
,
1160 struct kobject
*kobj
= >
->i915
->drm
.primary
->kdev
->kobj
;
1161 char *error_event
[] = { I915_ERROR_UEVENT
"=1", NULL
};
1162 char *reset_event
[] = { I915_RESET_UEVENT
"=1", NULL
};
1163 char *reset_done_event
[] = { I915_ERROR_UEVENT
"=0", NULL
};
1164 struct intel_wedge_me w
;
1166 kobject_uevent_env(kobj
, KOBJ_CHANGE
, error_event
);
1168 DRM_DEBUG_DRIVER("resetting chip\n");
1169 kobject_uevent_env(kobj
, KOBJ_CHANGE
, reset_event
);
1171 /* Use a watchdog to ensure that our reset completes */
1172 intel_wedge_on_timeout(&w
, gt
, 5 * HZ
) {
1173 intel_prepare_reset(gt
->i915
);
1175 /* Flush everyone using a resource about to be clobbered */
1176 synchronize_srcu_expedited(>
->reset
.backoff_srcu
);
1178 intel_gt_reset(gt
, engine_mask
, reason
);
1180 intel_finish_reset(gt
->i915
);
1183 if (!test_bit(I915_WEDGED
, >
->reset
.flags
))
1184 kobject_uevent_env(kobj
, KOBJ_CHANGE
, reset_done_event
);
1188 * intel_gt_handle_error - handle a gpu error
1190 * @engine_mask: mask representing engines that are hung
1191 * @flags: control flags
1192 * @fmt: Error message format string
1194 * Do some basic checking of register state at error time and
1195 * dump it to the syslog. Also call i915_capture_error_state() to make
1196 * sure we get a record and make it available in debugfs. Fire a uevent
1197 * so userspace knows something bad happened (should trigger collection
1198 * of a ring dump etc.).
1200 void intel_gt_handle_error(struct intel_gt
*gt
,
1201 intel_engine_mask_t engine_mask
,
1202 unsigned long flags
,
1203 const char *fmt
, ...)
1205 struct intel_engine_cs
*engine
;
1206 intel_wakeref_t wakeref
;
1207 intel_engine_mask_t tmp
;
1214 va_start(args
, fmt
);
1215 vscnprintf(error_msg
, sizeof(error_msg
), fmt
, args
);
1222 * In most cases it's guaranteed that we get here with an RPM
1223 * reference held, for example because there is a pending GPU
1224 * request that won't finish until the reset is done. This
1225 * isn't the case at least when we get here by doing a
1226 * simulated reset via debugfs, so get an RPM reference.
1228 wakeref
= intel_runtime_pm_get(gt
->uncore
->rpm
);
1230 engine_mask
&= INTEL_INFO(gt
->i915
)->engine_mask
;
1232 if (flags
& I915_ERROR_CAPTURE
) {
1233 i915_capture_error_state(gt
->i915
);
1234 intel_gt_clear_error_registers(gt
, engine_mask
);
1238 * Try engine reset when available. We fall back to full reset if
1239 * single reset fails.
1241 if (intel_has_reset_engine(gt
) && !intel_gt_is_wedged(gt
)) {
1242 for_each_engine_masked(engine
, gt
, engine_mask
, tmp
) {
1243 BUILD_BUG_ON(I915_RESET_MODESET
>= I915_RESET_ENGINE
);
1244 if (test_and_set_bit(I915_RESET_ENGINE
+ engine
->id
,
1248 if (intel_engine_reset(engine
, msg
) == 0)
1249 engine_mask
&= ~engine
->mask
;
1251 clear_and_wake_up_bit(I915_RESET_ENGINE
+ engine
->id
,
1259 /* Full reset needs the mutex, stop any other user trying to do so. */
1260 if (test_and_set_bit(I915_RESET_BACKOFF
, >
->reset
.flags
)) {
1261 wait_event(gt
->reset
.queue
,
1262 !test_bit(I915_RESET_BACKOFF
, >
->reset
.flags
));
1263 goto out
; /* piggy-back on the other reset */
1266 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1267 synchronize_rcu_expedited();
1269 /* Prevent any other reset-engine attempt. */
1270 for_each_engine(engine
, gt
, tmp
) {
1271 while (test_and_set_bit(I915_RESET_ENGINE
+ engine
->id
,
1273 wait_on_bit(>
->reset
.flags
,
1274 I915_RESET_ENGINE
+ engine
->id
,
1275 TASK_UNINTERRUPTIBLE
);
1278 intel_gt_reset_global(gt
, engine_mask
, msg
);
1280 for_each_engine(engine
, gt
, tmp
)
1281 clear_bit_unlock(I915_RESET_ENGINE
+ engine
->id
,
1283 clear_bit_unlock(I915_RESET_BACKOFF
, >
->reset
.flags
);
1284 smp_mb__after_atomic();
1285 wake_up_all(>
->reset
.queue
);
1288 intel_runtime_pm_put(gt
->uncore
->rpm
, wakeref
);
1291 int intel_gt_reset_trylock(struct intel_gt
*gt
, int *srcu
)
1293 might_lock(>
->reset
.backoff_srcu
);
1297 while (test_bit(I915_RESET_BACKOFF
, >
->reset
.flags
)) {
1300 if (wait_event_interruptible(gt
->reset
.queue
,
1301 !test_bit(I915_RESET_BACKOFF
,
1307 *srcu
= srcu_read_lock(>
->reset
.backoff_srcu
);
1313 void intel_gt_reset_unlock(struct intel_gt
*gt
, int tag
)
1314 __releases(>
->reset
.backoff_srcu
)
1316 srcu_read_unlock(>
->reset
.backoff_srcu
, tag
);
1319 int intel_gt_terminally_wedged(struct intel_gt
*gt
)
1323 if (!intel_gt_is_wedged(gt
))
1326 if (intel_gt_has_init_error(gt
))
1329 /* Reset still in progress? Maybe we will recover? */
1330 if (wait_event_interruptible(gt
->reset
.queue
,
1331 !test_bit(I915_RESET_BACKOFF
,
1335 return intel_gt_is_wedged(gt
) ? -EIO
: 0;
1338 void intel_gt_set_wedged_on_init(struct intel_gt
*gt
)
1340 BUILD_BUG_ON(I915_RESET_ENGINE
+ I915_NUM_ENGINES
>
1341 I915_WEDGED_ON_INIT
);
1342 intel_gt_set_wedged(gt
);
1343 set_bit(I915_WEDGED_ON_INIT
, >
->reset
.flags
);
1346 void intel_gt_init_reset(struct intel_gt
*gt
)
1348 init_waitqueue_head(>
->reset
.queue
);
1349 mutex_init(>
->reset
.mutex
);
1350 init_srcu_struct(>
->reset
.backoff_srcu
);
1352 /* no GPU until we are ready! */
1353 __set_bit(I915_WEDGED
, >
->reset
.flags
);
1356 void intel_gt_fini_reset(struct intel_gt
*gt
)
1358 cleanup_srcu_struct(>
->reset
.backoff_srcu
);
1361 static void intel_wedge_me(struct work_struct
*work
)
1363 struct intel_wedge_me
*w
= container_of(work
, typeof(*w
), work
.work
);
1365 dev_err(w
->gt
->i915
->drm
.dev
,
1366 "%s timed out, cancelling all in-flight rendering.\n",
1368 intel_gt_set_wedged(w
->gt
);
1371 void __intel_init_wedge(struct intel_wedge_me
*w
,
1372 struct intel_gt
*gt
,
1379 INIT_DELAYED_WORK_ONSTACK(&w
->work
, intel_wedge_me
);
1380 schedule_delayed_work(&w
->work
, timeout
);
1383 void __intel_fini_wedge(struct intel_wedge_me
*w
)
1385 cancel_delayed_work_sync(&w
->work
);
1386 destroy_delayed_work_on_stack(&w
->work
);
1390 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1391 #include "selftest_reset.c"
1392 #include "selftest_hangcheck.c"