treewide: remove redundant IS_ERR() before error code check
[linux/fpc-iii.git] / drivers / gpu / drm / i915 / gt / intel_reset.c
blobbeee0cf89bced38af4e664ab7a129dea5c76f0a5
1 /*
2 * SPDX-License-Identifier: MIT
4 * Copyright © 2008-2018 Intel Corporation
5 */
7 #include <linux/sched/mm.h>
8 #include <linux/stop_machine.h>
10 #include "display/intel_display_types.h"
11 #include "display/intel_overlay.h"
13 #include "gem/i915_gem_context.h"
15 #include "i915_drv.h"
16 #include "i915_gpu_error.h"
17 #include "i915_irq.h"
18 #include "intel_engine_pm.h"
19 #include "intel_gt.h"
20 #include "intel_gt_pm.h"
21 #include "intel_reset.h"
23 #include "uc/intel_guc.h"
24 #include "uc/intel_guc_submission.h"
26 #define RESET_MAX_RETRIES 3
28 /* XXX How to handle concurrent GGTT updates using tiling registers? */
29 #define RESET_UNDER_STOP_MACHINE 0
31 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
33 intel_uncore_rmw_fw(uncore, reg, 0, set);
36 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
38 intel_uncore_rmw_fw(uncore, reg, clr, 0);
41 static void engine_skip_context(struct i915_request *rq)
43 struct intel_engine_cs *engine = rq->engine;
44 struct intel_context *hung_ctx = rq->context;
46 if (!i915_request_is_active(rq))
47 return;
49 lockdep_assert_held(&engine->active.lock);
50 list_for_each_entry_continue(rq, &engine->active.requests, sched.link)
51 if (rq->context == hung_ctx)
52 i915_request_skip(rq, -EIO);
55 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
57 struct drm_i915_file_private *file_priv = ctx->file_priv;
58 unsigned long prev_hang;
59 unsigned int score;
61 if (IS_ERR_OR_NULL(file_priv))
62 return;
64 score = 0;
65 if (banned)
66 score = I915_CLIENT_SCORE_CONTEXT_BAN;
68 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
69 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
70 score += I915_CLIENT_SCORE_HANG_FAST;
72 if (score) {
73 atomic_add(score, &file_priv->ban_score);
75 DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
76 ctx->name, score,
77 atomic_read(&file_priv->ban_score));
81 static bool mark_guilty(struct i915_request *rq)
83 struct i915_gem_context *ctx;
84 unsigned long prev_hang;
85 bool banned;
86 int i;
88 rcu_read_lock();
89 ctx = rcu_dereference(rq->context->gem_context);
90 if (ctx && !kref_get_unless_zero(&ctx->ref))
91 ctx = NULL;
92 rcu_read_unlock();
93 if (!ctx)
94 return false;
96 if (i915_gem_context_is_closed(ctx)) {
97 intel_context_set_banned(rq->context);
98 banned = true;
99 goto out;
102 atomic_inc(&ctx->guilty_count);
104 /* Cool contexts are too cool to be banned! (Used for reset testing.) */
105 if (!i915_gem_context_is_bannable(ctx)) {
106 banned = false;
107 goto out;
110 dev_notice(ctx->i915->drm.dev,
111 "%s context reset due to GPU hang\n",
112 ctx->name);
114 /* Record the timestamp for the last N hangs */
115 prev_hang = ctx->hang_timestamp[0];
116 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
117 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
118 ctx->hang_timestamp[i] = jiffies;
120 /* If we have hung N+1 times in rapid succession, we ban the context! */
121 banned = !i915_gem_context_is_recoverable(ctx);
122 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
123 banned = true;
124 if (banned) {
125 DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n",
126 ctx->name, atomic_read(&ctx->guilty_count));
127 intel_context_set_banned(rq->context);
130 client_mark_guilty(ctx, banned);
132 out:
133 i915_gem_context_put(ctx);
134 return banned;
137 static void mark_innocent(struct i915_request *rq)
139 struct i915_gem_context *ctx;
141 rcu_read_lock();
142 ctx = rcu_dereference(rq->context->gem_context);
143 if (ctx)
144 atomic_inc(&ctx->active_count);
145 rcu_read_unlock();
148 void __i915_request_reset(struct i915_request *rq, bool guilty)
150 RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
152 GEM_BUG_ON(i915_request_completed(rq));
154 rcu_read_lock(); /* protect the GEM context */
155 if (guilty) {
156 i915_request_skip(rq, -EIO);
157 if (mark_guilty(rq))
158 engine_skip_context(rq);
159 } else {
160 dma_fence_set_error(&rq->fence, -EAGAIN);
161 mark_innocent(rq);
163 rcu_read_unlock();
166 static bool i915_in_reset(struct pci_dev *pdev)
168 u8 gdrst;
170 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
171 return gdrst & GRDOM_RESET_STATUS;
174 static int i915_do_reset(struct intel_gt *gt,
175 intel_engine_mask_t engine_mask,
176 unsigned int retry)
178 struct pci_dev *pdev = gt->i915->drm.pdev;
179 int err;
181 /* Assert reset for at least 20 usec, and wait for acknowledgement. */
182 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
183 udelay(50);
184 err = wait_for_atomic(i915_in_reset(pdev), 50);
186 /* Clear the reset request. */
187 pci_write_config_byte(pdev, I915_GDRST, 0);
188 udelay(50);
189 if (!err)
190 err = wait_for_atomic(!i915_in_reset(pdev), 50);
192 return err;
195 static bool g4x_reset_complete(struct pci_dev *pdev)
197 u8 gdrst;
199 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
200 return (gdrst & GRDOM_RESET_ENABLE) == 0;
203 static int g33_do_reset(struct intel_gt *gt,
204 intel_engine_mask_t engine_mask,
205 unsigned int retry)
207 struct pci_dev *pdev = gt->i915->drm.pdev;
209 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
210 return wait_for_atomic(g4x_reset_complete(pdev), 50);
213 static int g4x_do_reset(struct intel_gt *gt,
214 intel_engine_mask_t engine_mask,
215 unsigned int retry)
217 struct pci_dev *pdev = gt->i915->drm.pdev;
218 struct intel_uncore *uncore = gt->uncore;
219 int ret;
221 /* WaVcpClkGateDisableForMediaReset:ctg,elk */
222 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
223 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
225 pci_write_config_byte(pdev, I915_GDRST,
226 GRDOM_MEDIA | GRDOM_RESET_ENABLE);
227 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
228 if (ret) {
229 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
230 goto out;
233 pci_write_config_byte(pdev, I915_GDRST,
234 GRDOM_RENDER | GRDOM_RESET_ENABLE);
235 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
236 if (ret) {
237 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
238 goto out;
241 out:
242 pci_write_config_byte(pdev, I915_GDRST, 0);
244 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
245 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
247 return ret;
250 static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
251 unsigned int retry)
253 struct intel_uncore *uncore = gt->uncore;
254 int ret;
256 intel_uncore_write_fw(uncore, ILK_GDSR,
257 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
258 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
259 ILK_GRDOM_RESET_ENABLE, 0,
260 5000, 0,
261 NULL);
262 if (ret) {
263 DRM_DEBUG_DRIVER("Wait for render reset failed\n");
264 goto out;
267 intel_uncore_write_fw(uncore, ILK_GDSR,
268 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
269 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
270 ILK_GRDOM_RESET_ENABLE, 0,
271 5000, 0,
272 NULL);
273 if (ret) {
274 DRM_DEBUG_DRIVER("Wait for media reset failed\n");
275 goto out;
278 out:
279 intel_uncore_write_fw(uncore, ILK_GDSR, 0);
280 intel_uncore_posting_read_fw(uncore, ILK_GDSR);
281 return ret;
284 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
285 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
287 struct intel_uncore *uncore = gt->uncore;
288 int err;
291 * GEN6_GDRST is not in the gt power well, no need to check
292 * for fifo space for the write or forcewake the chip for
293 * the read
295 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
297 /* Wait for the device to ack the reset requests */
298 err = __intel_wait_for_register_fw(uncore,
299 GEN6_GDRST, hw_domain_mask, 0,
300 500, 0,
301 NULL);
302 if (err)
303 DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
304 hw_domain_mask);
306 return err;
309 static int gen6_reset_engines(struct intel_gt *gt,
310 intel_engine_mask_t engine_mask,
311 unsigned int retry)
313 static const u32 hw_engine_mask[] = {
314 [RCS0] = GEN6_GRDOM_RENDER,
315 [BCS0] = GEN6_GRDOM_BLT,
316 [VCS0] = GEN6_GRDOM_MEDIA,
317 [VCS1] = GEN8_GRDOM_MEDIA2,
318 [VECS0] = GEN6_GRDOM_VECS,
320 struct intel_engine_cs *engine;
321 u32 hw_mask;
323 if (engine_mask == ALL_ENGINES) {
324 hw_mask = GEN6_GRDOM_FULL;
325 } else {
326 intel_engine_mask_t tmp;
328 hw_mask = 0;
329 for_each_engine_masked(engine, gt, engine_mask, tmp) {
330 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
331 hw_mask |= hw_engine_mask[engine->id];
335 return gen6_hw_domain_reset(gt, hw_mask);
338 static int gen11_lock_sfc(struct intel_engine_cs *engine, u32 *hw_mask)
340 struct intel_uncore *uncore = engine->uncore;
341 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
342 i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
343 u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
344 i915_reg_t sfc_usage;
345 u32 sfc_usage_bit;
346 u32 sfc_reset_bit;
347 int ret;
349 switch (engine->class) {
350 case VIDEO_DECODE_CLASS:
351 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
352 return 0;
354 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
355 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
357 sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
358 sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
360 sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
361 sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
362 sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
363 break;
365 case VIDEO_ENHANCEMENT_CLASS:
366 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
367 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
369 sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
370 sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
372 sfc_usage = GEN11_VECS_SFC_USAGE(engine);
373 sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
374 sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
375 break;
377 default:
378 return 0;
382 * If the engine is using a SFC, tell the engine that a software reset
383 * is going to happen. The engine will then try to force lock the SFC.
384 * If SFC ends up being locked to the engine we want to reset, we have
385 * to reset it as well (we will unlock it once the reset sequence is
386 * completed).
388 if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
389 return 0;
391 rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
393 ret = __intel_wait_for_register_fw(uncore,
394 sfc_forced_lock_ack,
395 sfc_forced_lock_ack_bit,
396 sfc_forced_lock_ack_bit,
397 1000, 0, NULL);
399 /* Was the SFC released while we were trying to lock it? */
400 if (!(intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit))
401 return 0;
403 if (ret) {
404 DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
405 return ret;
408 *hw_mask |= sfc_reset_bit;
409 return 0;
412 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
414 struct intel_uncore *uncore = engine->uncore;
415 u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access;
416 i915_reg_t sfc_forced_lock;
417 u32 sfc_forced_lock_bit;
419 switch (engine->class) {
420 case VIDEO_DECODE_CLASS:
421 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
422 return;
424 sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
425 sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
426 break;
428 case VIDEO_ENHANCEMENT_CLASS:
429 sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
430 sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
431 break;
433 default:
434 return;
437 rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit);
440 static int gen11_reset_engines(struct intel_gt *gt,
441 intel_engine_mask_t engine_mask,
442 unsigned int retry)
444 static const u32 hw_engine_mask[] = {
445 [RCS0] = GEN11_GRDOM_RENDER,
446 [BCS0] = GEN11_GRDOM_BLT,
447 [VCS0] = GEN11_GRDOM_MEDIA,
448 [VCS1] = GEN11_GRDOM_MEDIA2,
449 [VCS2] = GEN11_GRDOM_MEDIA3,
450 [VCS3] = GEN11_GRDOM_MEDIA4,
451 [VECS0] = GEN11_GRDOM_VECS,
452 [VECS1] = GEN11_GRDOM_VECS2,
454 struct intel_engine_cs *engine;
455 intel_engine_mask_t tmp;
456 u32 hw_mask;
457 int ret;
459 if (engine_mask == ALL_ENGINES) {
460 hw_mask = GEN11_GRDOM_FULL;
461 } else {
462 hw_mask = 0;
463 for_each_engine_masked(engine, gt, engine_mask, tmp) {
464 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
465 hw_mask |= hw_engine_mask[engine->id];
466 ret = gen11_lock_sfc(engine, &hw_mask);
467 if (ret)
468 goto sfc_unlock;
472 ret = gen6_hw_domain_reset(gt, hw_mask);
474 sfc_unlock:
476 * We unlock the SFC based on the lock status and not the result of
477 * gen11_lock_sfc to make sure that we clean properly if something
478 * wrong happened during the lock (e.g. lock acquired after timeout
479 * expiration).
481 if (engine_mask != ALL_ENGINES)
482 for_each_engine_masked(engine, gt, engine_mask, tmp)
483 gen11_unlock_sfc(engine);
485 return ret;
488 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
490 struct intel_uncore *uncore = engine->uncore;
491 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
492 u32 request, mask, ack;
493 int ret;
495 ack = intel_uncore_read_fw(uncore, reg);
496 if (ack & RESET_CTL_CAT_ERROR) {
498 * For catastrophic errors, ready-for-reset sequence
499 * needs to be bypassed: HAS#396813
501 request = RESET_CTL_CAT_ERROR;
502 mask = RESET_CTL_CAT_ERROR;
504 /* Catastrophic errors need to be cleared by HW */
505 ack = 0;
506 } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
507 request = RESET_CTL_REQUEST_RESET;
508 mask = RESET_CTL_READY_TO_RESET;
509 ack = RESET_CTL_READY_TO_RESET;
510 } else {
511 return 0;
514 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
515 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
516 700, 0, NULL);
517 if (ret)
518 DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
519 engine->name, request,
520 intel_uncore_read_fw(uncore, reg));
522 return ret;
525 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
527 intel_uncore_write_fw(engine->uncore,
528 RING_RESET_CTL(engine->mmio_base),
529 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
532 static int gen8_reset_engines(struct intel_gt *gt,
533 intel_engine_mask_t engine_mask,
534 unsigned int retry)
536 struct intel_engine_cs *engine;
537 const bool reset_non_ready = retry >= 1;
538 intel_engine_mask_t tmp;
539 int ret;
541 for_each_engine_masked(engine, gt, engine_mask, tmp) {
542 ret = gen8_engine_reset_prepare(engine);
543 if (ret && !reset_non_ready)
544 goto skip_reset;
547 * If this is not the first failed attempt to prepare,
548 * we decide to proceed anyway.
550 * By doing so we risk context corruption and with
551 * some gens (kbl), possible system hang if reset
552 * happens during active bb execution.
554 * We rather take context corruption instead of
555 * failed reset with a wedged driver/gpu. And
556 * active bb execution case should be covered by
557 * stop_engines() we have before the reset.
561 if (INTEL_GEN(gt->i915) >= 11)
562 ret = gen11_reset_engines(gt, engine_mask, retry);
563 else
564 ret = gen6_reset_engines(gt, engine_mask, retry);
566 skip_reset:
567 for_each_engine_masked(engine, gt, engine_mask, tmp)
568 gen8_engine_reset_cancel(engine);
570 return ret;
573 static int mock_reset(struct intel_gt *gt,
574 intel_engine_mask_t mask,
575 unsigned int retry)
577 return 0;
580 typedef int (*reset_func)(struct intel_gt *,
581 intel_engine_mask_t engine_mask,
582 unsigned int retry);
584 static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
586 struct drm_i915_private *i915 = gt->i915;
588 if (is_mock_gt(gt))
589 return mock_reset;
590 else if (INTEL_GEN(i915) >= 8)
591 return gen8_reset_engines;
592 else if (INTEL_GEN(i915) >= 6)
593 return gen6_reset_engines;
594 else if (INTEL_GEN(i915) >= 5)
595 return ilk_do_reset;
596 else if (IS_G4X(i915))
597 return g4x_do_reset;
598 else if (IS_G33(i915) || IS_PINEVIEW(i915))
599 return g33_do_reset;
600 else if (INTEL_GEN(i915) >= 3)
601 return i915_do_reset;
602 else
603 return NULL;
606 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
608 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
609 reset_func reset;
610 int ret = -ETIMEDOUT;
611 int retry;
613 reset = intel_get_gpu_reset(gt);
614 if (!reset)
615 return -ENODEV;
618 * If the power well sleeps during the reset, the reset
619 * request may be dropped and never completes (causing -EIO).
621 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
622 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
623 GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
624 preempt_disable();
625 ret = reset(gt, engine_mask, retry);
626 preempt_enable();
628 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
630 return ret;
633 bool intel_has_gpu_reset(const struct intel_gt *gt)
635 if (!i915_modparams.reset)
636 return NULL;
638 return intel_get_gpu_reset(gt);
641 bool intel_has_reset_engine(const struct intel_gt *gt)
643 if (i915_modparams.reset < 2)
644 return false;
646 return INTEL_INFO(gt->i915)->has_reset_engine;
649 int intel_reset_guc(struct intel_gt *gt)
651 u32 guc_domain =
652 INTEL_GEN(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
653 int ret;
655 GEM_BUG_ON(!HAS_GT_UC(gt->i915));
657 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
658 ret = gen6_hw_domain_reset(gt, guc_domain);
659 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
661 return ret;
665 * Ensure irq handler finishes, and not run again.
666 * Also return the active request so that we only search for it once.
668 static void reset_prepare_engine(struct intel_engine_cs *engine)
671 * During the reset sequence, we must prevent the engine from
672 * entering RC6. As the context state is undefined until we restart
673 * the engine, if it does enter RC6 during the reset, the state
674 * written to the powercontext is undefined and so we may lose
675 * GPU state upon resume, i.e. fail to restart after a reset.
677 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
678 if (engine->reset.prepare)
679 engine->reset.prepare(engine);
682 static void revoke_mmaps(struct intel_gt *gt)
684 int i;
686 for (i = 0; i < gt->ggtt->num_fences; i++) {
687 struct drm_vma_offset_node *node;
688 struct i915_vma *vma;
689 u64 vma_offset;
691 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
692 if (!vma)
693 continue;
695 if (!i915_vma_has_userfault(vma))
696 continue;
698 GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
700 if (!vma->mmo)
701 continue;
703 node = &vma->mmo->vma_node;
704 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
706 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
707 drm_vma_node_offset_addr(node) + vma_offset,
708 vma->size,
713 static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
715 struct intel_engine_cs *engine;
716 intel_engine_mask_t awake = 0;
717 enum intel_engine_id id;
719 for_each_engine(engine, gt, id) {
720 if (intel_engine_pm_get_if_awake(engine))
721 awake |= engine->mask;
722 reset_prepare_engine(engine);
725 intel_uc_reset_prepare(&gt->uc);
727 return awake;
/* Revoke access to @gt ahead of a reset; currently only the fenced mmaps. */
static void gt_revoke(struct intel_gt *gt)
{
	revoke_mmaps(gt);
}
735 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
737 struct intel_engine_cs *engine;
738 enum intel_engine_id id;
739 int err;
742 * Everything depends on having the GTT running, so we need to start
743 * there.
745 err = i915_ggtt_enable_hw(gt->i915);
746 if (err)
747 return err;
749 for_each_engine(engine, gt, id)
750 __intel_engine_reset(engine, stalled_mask & engine->mask);
752 i915_gem_restore_fences(gt->ggtt);
754 return err;
757 static void reset_finish_engine(struct intel_engine_cs *engine)
759 if (engine->reset.finish)
760 engine->reset.finish(engine);
761 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
763 intel_engine_signal_breadcrumbs(engine);
766 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
768 struct intel_engine_cs *engine;
769 enum intel_engine_id id;
771 for_each_engine(engine, gt, id) {
772 reset_finish_engine(engine);
773 if (awake & engine->mask)
774 intel_engine_pm_put(engine);
778 static void nop_submit_request(struct i915_request *request)
780 struct intel_engine_cs *engine = request->engine;
781 unsigned long flags;
783 RQ_TRACE(request, "-EIO\n");
784 dma_fence_set_error(&request->fence, -EIO);
786 spin_lock_irqsave(&engine->active.lock, flags);
787 __i915_request_submit(request);
788 i915_request_mark_complete(request);
789 spin_unlock_irqrestore(&engine->active.lock, flags);
791 intel_engine_signal_breadcrumbs(engine);
794 static void __intel_gt_set_wedged(struct intel_gt *gt)
796 struct intel_engine_cs *engine;
797 intel_engine_mask_t awake;
798 enum intel_engine_id id;
800 if (test_bit(I915_WEDGED, &gt->reset.flags))
801 return;
803 if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(gt)) {
804 struct drm_printer p = drm_debug_printer(__func__);
806 for_each_engine(engine, gt, id)
807 intel_engine_dump(engine, &p, "%s\n", engine->name);
810 GT_TRACE(gt, "start\n");
813 * First, stop submission to hw, but do not yet complete requests by
814 * rolling the global seqno forward (since this would complete requests
815 * for which we haven't set the fence error to EIO yet).
817 awake = reset_prepare(gt);
819 /* Even if the GPU reset fails, it should still stop the engines */
820 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
821 __intel_gt_reset(gt, ALL_ENGINES);
823 for_each_engine(engine, gt, id)
824 engine->submit_request = nop_submit_request;
827 * Make sure no request can slip through without getting completed by
828 * either this call here to intel_engine_write_global_seqno, or the one
829 * in nop_submit_request.
831 synchronize_rcu_expedited();
832 set_bit(I915_WEDGED, &gt->reset.flags);
834 /* Mark all executing requests as skipped */
835 for_each_engine(engine, gt, id)
836 if (engine->reset.cancel)
837 engine->reset.cancel(engine);
839 reset_finish(gt, awake);
841 GT_TRACE(gt, "end\n");
844 void intel_gt_set_wedged(struct intel_gt *gt)
846 intel_wakeref_t wakeref;
848 mutex_lock(&gt->reset.mutex);
849 with_intel_runtime_pm(gt->uncore->rpm, wakeref)
850 __intel_gt_set_wedged(gt);
851 mutex_unlock(&gt->reset.mutex);
854 static bool __intel_gt_unset_wedged(struct intel_gt *gt)
856 struct intel_gt_timelines *timelines = &gt->timelines;
857 struct intel_timeline *tl;
858 bool ok;
860 if (!test_bit(I915_WEDGED, &gt->reset.flags))
861 return true;
863 /* Never fully initialised, recovery impossible */
864 if (test_bit(I915_WEDGED_ON_INIT, &gt->reset.flags))
865 return false;
867 GT_TRACE(gt, "start\n");
870 * Before unwedging, make sure that all pending operations
871 * are flushed and errored out - we may have requests waiting upon
872 * third party fences. We marked all inflight requests as EIO, and
873 * every execbuf since returned EIO, for consistency we want all
874 * the currently pending requests to also be marked as EIO, which
875 * is done inside our nop_submit_request - and so we must wait.
877 * No more can be submitted until we reset the wedged bit.
879 spin_lock(&timelines->lock);
880 list_for_each_entry(tl, &timelines->active_list, link) {
881 struct dma_fence *fence;
883 fence = i915_active_fence_get(&tl->last_request);
884 if (!fence)
885 continue;
887 spin_unlock(&timelines->lock);
890 * All internal dependencies (i915_requests) will have
891 * been flushed by the set-wedge, but we may be stuck waiting
892 * for external fences. These should all be capped to 10s
893 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
894 * in the worst case.
896 dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
897 dma_fence_put(fence);
899 /* Restart iteration after droping lock */
900 spin_lock(&timelines->lock);
901 tl = list_entry(&timelines->active_list, typeof(*tl), link);
903 spin_unlock(&timelines->lock);
905 /* We must reset pending GPU events before restoring our submission */
906 ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
907 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
908 ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
909 if (!ok) {
911 * Warn CI about the unrecoverable wedged condition.
912 * Time for a reboot.
914 add_taint_for_CI(TAINT_WARN);
915 return false;
919 * Undo nop_submit_request. We prevent all new i915 requests from
920 * being queued (by disallowing execbuf whilst wedged) so having
921 * waited for all active requests above, we know the system is idle
922 * and do not have to worry about a thread being inside
923 * engine->submit_request() as we swap over. So unlike installing
924 * the nop_submit_request on reset, we can do this from normal
925 * context and do not require stop_machine().
927 intel_engines_reset_default_submission(gt);
929 GT_TRACE(gt, "end\n");
931 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
932 clear_bit(I915_WEDGED, &gt->reset.flags);
934 return true;
937 bool intel_gt_unset_wedged(struct intel_gt *gt)
939 bool result;
941 mutex_lock(&gt->reset.mutex);
942 result = __intel_gt_unset_wedged(gt);
943 mutex_unlock(&gt->reset.mutex);
945 return result;
948 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
950 int err, i;
952 gt_revoke(gt);
954 err = __intel_gt_reset(gt, ALL_ENGINES);
955 for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
956 msleep(10 * (i + 1));
957 err = __intel_gt_reset(gt, ALL_ENGINES);
959 if (err)
960 return err;
962 return gt_reset(gt, stalled_mask);
965 static int resume(struct intel_gt *gt)
967 struct intel_engine_cs *engine;
968 enum intel_engine_id id;
969 int ret;
971 for_each_engine(engine, gt, id) {
972 ret = engine->resume(engine);
973 if (ret)
974 return ret;
977 return 0;
981 * intel_gt_reset - reset chip after a hang
982 * @gt: #intel_gt to reset
983 * @stalled_mask: mask of the stalled engines with the guilty requests
984 * @reason: user error message for why we are resetting
986 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
987 * on failure.
989 * Procedure is fairly simple:
990 * - reset the chip using the reset reg
991 * - re-init context state
992 * - re-init hardware status page
993 * - re-init ring buffer
994 * - re-init interrupt state
995 * - re-init display
997 void intel_gt_reset(struct intel_gt *gt,
998 intel_engine_mask_t stalled_mask,
999 const char *reason)
1001 intel_engine_mask_t awake;
1002 int ret;
1004 GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1006 might_sleep();
1007 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1008 mutex_lock(&gt->reset.mutex);
1010 /* Clear any previous failed attempts at recovery. Time to try again. */
1011 if (!__intel_gt_unset_wedged(gt))
1012 goto unlock;
1014 if (reason)
1015 dev_notice(gt->i915->drm.dev,
1016 "Resetting chip for %s\n", reason);
1017 atomic_inc(&gt->i915->gpu_error.reset_count);
1019 awake = reset_prepare(gt);
1021 if (!intel_has_gpu_reset(gt)) {
1022 if (i915_modparams.reset)
1023 dev_err(gt->i915->drm.dev, "GPU reset not supported\n");
1024 else
1025 DRM_DEBUG_DRIVER("GPU reset disabled\n");
1026 goto error;
1029 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1030 intel_runtime_pm_disable_interrupts(gt->i915);
1032 if (do_reset(gt, stalled_mask)) {
1033 dev_err(gt->i915->drm.dev, "Failed to reset chip\n");
1034 goto taint;
1037 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1038 intel_runtime_pm_enable_interrupts(gt->i915);
1040 intel_overlay_reset(gt->i915);
1043 * Next we need to restore the context, but we don't use those
1044 * yet either...
1046 * Ring buffer needs to be re-initialized in the KMS case, or if X
1047 * was running at the time of the reset (i.e. we weren't VT
1048 * switched away).
1050 ret = intel_gt_init_hw(gt);
1051 if (ret) {
1052 DRM_ERROR("Failed to initialise HW following reset (%d)\n",
1053 ret);
1054 goto taint;
1057 ret = resume(gt);
1058 if (ret)
1059 goto taint;
1061 finish:
1062 reset_finish(gt, awake);
1063 unlock:
1064 mutex_unlock(&gt->reset.mutex);
1065 return;
1067 taint:
1069 * History tells us that if we cannot reset the GPU now, we
1070 * never will. This then impacts everything that is run
1071 * subsequently. On failing the reset, we mark the driver
1072 * as wedged, preventing further execution on the GPU.
1073 * We also want to go one step further and add a taint to the
1074 * kernel so that any subsequent faults can be traced back to
1075 * this failure. This is important for CI, where if the
1076 * GPU/driver fails we would like to reboot and restart testing
1077 * rather than continue on into oblivion. For everyone else,
1078 * the system should still plod along, but they have been warned!
1080 add_taint_for_CI(TAINT_WARN);
1081 error:
1082 __intel_gt_set_wedged(gt);
1083 goto finish;
1086 static inline int intel_gt_reset_engine(struct intel_engine_cs *engine)
1088 return __intel_gt_reset(engine->gt, engine->mask);
1092 * intel_engine_reset - reset GPU engine to recover from a hang
1093 * @engine: engine to reset
1094 * @msg: reason for GPU reset; or NULL for no dev_notice()
1096 * Reset a specific GPU engine. Useful if a hang is detected.
1097 * Returns zero on successful reset or otherwise an error code.
1099 * Procedure is:
1100 * - identifies the request that caused the hang and it is dropped
1101 * - reset engine (which will force the engine to idle)
1102 * - re-init/configure engine
1104 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1106 struct intel_gt *gt = engine->gt;
1107 bool uses_guc = intel_engine_in_guc_submission_mode(engine);
1108 int ret;
1110 ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1111 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1113 if (!intel_engine_pm_get_if_awake(engine))
1114 return 0;
1116 reset_prepare_engine(engine);
1118 if (msg)
1119 dev_notice(engine->i915->drm.dev,
1120 "Resetting %s for %s\n", engine->name, msg);
1121 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1123 if (!uses_guc)
1124 ret = intel_gt_reset_engine(engine);
1125 else
1126 ret = intel_guc_reset_engine(&engine->gt->uc.guc, engine);
1127 if (ret) {
1128 /* If we fail here, we expect to fallback to a global reset */
1129 DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
1130 uses_guc ? "GuC " : "",
1131 engine->name, ret);
1132 goto out;
1136 * The request that caused the hang is stuck on elsp, we know the
1137 * active request and can drop it, adjust head to skip the offending
1138 * request to resume executing remaining requests in the queue.
1140 __intel_engine_reset(engine, true);
1143 * The engine and its registers (and workarounds in case of render)
1144 * have been reset to their default values. Follow the init_ring
1145 * process to program RING_MODE, HWSP and re-enable submission.
1147 ret = engine->resume(engine);
1149 out:
1150 intel_engine_cancel_stop_cs(engine);
1151 reset_finish_engine(engine);
1152 intel_engine_pm_put_async(engine);
1153 return ret;
1156 static void intel_gt_reset_global(struct intel_gt *gt,
1157 u32 engine_mask,
1158 const char *reason)
1160 struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1161 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1162 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1163 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1164 struct intel_wedge_me w;
1166 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1168 DRM_DEBUG_DRIVER("resetting chip\n");
1169 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1171 /* Use a watchdog to ensure that our reset completes */
1172 intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1173 intel_prepare_reset(gt->i915);
1175 /* Flush everyone using a resource about to be clobbered */
1176 synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1178 intel_gt_reset(gt, engine_mask, reason);
1180 intel_finish_reset(gt->i915);
1183 if (!test_bit(I915_WEDGED, &gt->reset.flags))
1184 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1188 * intel_gt_handle_error - handle a gpu error
1189 * @gt: the intel_gt
1190 * @engine_mask: mask representing engines that are hung
1191 * @flags: control flags
1192 * @fmt: Error message format string
1194 * Do some basic checking of register state at error time and
1195 * dump it to the syslog. Also call i915_capture_error_state() to make
1196 * sure we get a record and make it available in debugfs. Fire a uevent
1197 * so userspace knows something bad happened (should trigger collection
1198 * of a ring dump etc.).
1200 void intel_gt_handle_error(struct intel_gt *gt,
1201 intel_engine_mask_t engine_mask,
1202 unsigned long flags,
1203 const char *fmt, ...)
1205 struct intel_engine_cs *engine;
1206 intel_wakeref_t wakeref;
1207 intel_engine_mask_t tmp;
1208 char error_msg[80];
1209 char *msg = NULL;
1211 if (fmt) {
1212 va_list args;
1214 va_start(args, fmt);
1215 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1216 va_end(args);
1218 msg = error_msg;
1222 * In most cases it's guaranteed that we get here with an RPM
1223 * reference held, for example because there is a pending GPU
1224 * request that won't finish until the reset is done. This
1225 * isn't the case at least when we get here by doing a
1226 * simulated reset via debugfs, so get an RPM reference.
1228 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1230 engine_mask &= INTEL_INFO(gt->i915)->engine_mask;
1232 if (flags & I915_ERROR_CAPTURE) {
1233 i915_capture_error_state(gt->i915);
1234 intel_gt_clear_error_registers(gt, engine_mask);
1238 * Try engine reset when available. We fall back to full reset if
1239 * single reset fails.
1241 if (intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
1242 for_each_engine_masked(engine, gt, engine_mask, tmp) {
1243 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1244 if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1245 &gt->reset.flags))
1246 continue;
1248 if (intel_engine_reset(engine, msg) == 0)
1249 engine_mask &= ~engine->mask;
1251 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1252 &gt->reset.flags);
1256 if (!engine_mask)
1257 goto out;
1259 /* Full reset needs the mutex, stop any other user trying to do so. */
1260 if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1261 wait_event(gt->reset.queue,
1262 !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1263 goto out; /* piggy-back on the other reset */
1266 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1267 synchronize_rcu_expedited();
1269 /* Prevent any other reset-engine attempt. */
1270 for_each_engine(engine, gt, tmp) {
1271 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1272 &gt->reset.flags))
1273 wait_on_bit(&gt->reset.flags,
1274 I915_RESET_ENGINE + engine->id,
1275 TASK_UNINTERRUPTIBLE);
1278 intel_gt_reset_global(gt, engine_mask, msg);
1280 for_each_engine(engine, gt, tmp)
1281 clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1282 &gt->reset.flags);
1283 clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1284 smp_mb__after_atomic();
1285 wake_up_all(&gt->reset.queue);
1287 out:
1288 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1291 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1293 might_lock(&gt->reset.backoff_srcu);
1294 might_sleep();
1296 rcu_read_lock();
1297 while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1298 rcu_read_unlock();
1300 if (wait_event_interruptible(gt->reset.queue,
1301 !test_bit(I915_RESET_BACKOFF,
1302 &gt->reset.flags)))
1303 return -EINTR;
1305 rcu_read_lock();
1307 *srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1308 rcu_read_unlock();
1310 return 0;
1313 void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1314 __releases(&gt->reset.backoff_srcu)
1316 srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1319 int intel_gt_terminally_wedged(struct intel_gt *gt)
1321 might_sleep();
1323 if (!intel_gt_is_wedged(gt))
1324 return 0;
1326 if (intel_gt_has_init_error(gt))
1327 return -EIO;
1329 /* Reset still in progress? Maybe we will recover? */
1330 if (wait_event_interruptible(gt->reset.queue,
1331 !test_bit(I915_RESET_BACKOFF,
1332 &gt->reset.flags)))
1333 return -EINTR;
1335 return intel_gt_is_wedged(gt) ? -EIO : 0;
1338 void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1340 BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1341 I915_WEDGED_ON_INIT);
1342 intel_gt_set_wedged(gt);
1343 set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1346 void intel_gt_init_reset(struct intel_gt *gt)
1348 init_waitqueue_head(&gt->reset.queue);
1349 mutex_init(&gt->reset.mutex);
1350 init_srcu_struct(&gt->reset.backoff_srcu);
1352 /* no GPU until we are ready! */
1353 __set_bit(I915_WEDGED, &gt->reset.flags);
1356 void intel_gt_fini_reset(struct intel_gt *gt)
1358 cleanup_srcu_struct(&gt->reset.backoff_srcu);
1361 static void intel_wedge_me(struct work_struct *work)
1363 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1365 dev_err(w->gt->i915->drm.dev,
1366 "%s timed out, cancelling all in-flight rendering.\n",
1367 w->name);
1368 intel_gt_set_wedged(w->gt);
1371 void __intel_init_wedge(struct intel_wedge_me *w,
1372 struct intel_gt *gt,
1373 long timeout,
1374 const char *name)
1376 w->gt = gt;
1377 w->name = name;
1379 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1380 schedule_delayed_work(&w->work, timeout);
1383 void __intel_fini_wedge(struct intel_wedge_me *w)
1385 cancel_delayed_work_sync(&w->work);
1386 destroy_delayed_work_on_stack(&w->work);
1387 w->gt = NULL;
1390 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1391 #include "selftest_reset.c"
1392 #include "selftest_hangcheck.c"
1393 #endif