/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
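/*
 * Periodic hangcheck for the GPU engines: each pass samples the
 * per-engine seqno and ACTHD, accumulates how long an engine has gone
 * without progress, and escalates from kicking a stuck wait up to
 * declaring a hang and resetting the device.
 */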
#include "i915_drv.h"

static bool
ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr)
{
	ipehr &= ~MI_SEMAPHORE_SYNC_MASK;
	return ipehr == (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE |
			 MI_SEMAPHORE_REGISTER);
}
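/*
 * Map the sync bits of a decoded MI_SEMAPHORE_MBOX wait back to the
 * engine expected to signal it; returns ERR_PTR(-ENODEV) if no engine
 * advertises a matching mailbox for this waiter.
 */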
static struct intel_engine_cs *
semaphore_wait_to_signaller_ring(struct intel_engine_cs *engine, u32 ipehr,
				 u64 offset)
{
	struct drm_i915_private *dev_priv = engine->i915;
	u32 sync_bits = ipehr & MI_SEMAPHORE_SYNC_MASK;
	struct intel_engine_cs *signaller;
	enum intel_engine_id id;

	for_each_engine(signaller, dev_priv, id) {
		if (engine == signaller)
			continue;

		if (sync_bits == signaller->semaphore.mbox.wait[engine->hw_id])
			return signaller;
	}

	DRM_DEBUG_DRIVER("No signaller ring found for %s, ipehr 0x%08x\n",
			 engine->name, ipehr);

	return ERR_PTR(-ENODEV);
}
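/*
 * Work out which seqno, and which signalling engine, a stuck semaphore
 * wait is blocked on by scanning back through the legacy ringbuffer for
 * the MI_SEMAPHORE_MBOX command (legacy ringbuffer mode only).
 */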
static struct intel_engine_cs *
semaphore_waits_for(struct intel_engine_cs *engine, u32 *seqno)
{
	struct drm_i915_private *dev_priv = engine->i915;
	void __iomem *vaddr;
	u32 cmd, ipehr, head;
	u64 offset = 0;
	int i, backwards;

	/*
	 * This function does not support execlist mode - any attempt to
	 * proceed further into this function will result in a kernel panic
	 * when dereferencing ring->buffer, which is not set up in execlist
	 * mode.
	 *
	 * The correct way of doing it would be to derive the currently
	 * executing ring buffer from the current context, which is derived
	 * from the currently running request. Unfortunately, to get the
	 * current request we would have to grab the struct_mutex before doing
	 * anything else, which would be ill-advised since some other thread
	 * might have grabbed it already and managed to hang itself, causing
	 * the hang checker to deadlock.
	 *
	 * Therefore, this function does not support execlist mode in its
	 * current form. Just return NULL and move on.
	 */
	if (engine->buffer == NULL)
		return NULL;

	ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
	if (!ipehr_is_semaphore_wait(engine, ipehr))
		return NULL;
	/*
	 * HEAD is likely pointing to the dword after the actual command,
	 * so scan backwards until we find the MBOX. But limit it to just 3
	 * or 4 dwords depending on the semaphore wait command size.
	 * Note that we don't care about ACTHD here since that might
	 * point at a batch, and semaphores are always emitted into the
	 * ringbuffer itself.
	 */
	head = I915_READ_HEAD(engine) & HEAD_ADDR;
	backwards = (INTEL_GEN(dev_priv) >= 8) ? 5 : 4;
	vaddr = (void __iomem *)engine->buffer->vaddr;

	for (i = backwards; i; --i) {
		/*
		 * Be paranoid and presume the hw has gone off into the wild -
		 * our ring is smaller than what the hardware (and hence
		 * HEAD_ADDR) allows. Also handles wrap-around.
		 */
		head &= engine->buffer->size - 1;
		/* This here seems to blow up */
		cmd = ioread32(vaddr + head);
		if (cmd == ipehr)
			break;

		head -= 4;
	}

	if (!i)
		return NULL;

	*seqno = ioread32(vaddr + head + 4) + 1;
	if (INTEL_GEN(dev_priv) >= 8) {
		/* gen8+ wait commands carry a 64-bit semaphore address */
		offset = ioread32(vaddr + head + 12);
		offset <<= 32;
		offset |= ioread32(vaddr + head + 8);
	}
	return semaphore_wait_to_signaller_ring(engine, ipehr, offset);
}
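/*
 * Check whether the semaphore this engine is stuck on can make progress:
 * a positive return means the wait has already been satisfied and can be
 * kicked, zero means the wait still looks legitimate, and a negative
 * value means no progress is possible (unknown wait or deadlock).
 */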
static int semaphore_passed(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_engine_cs *signaller;
	u32 seqno;

	engine->hangcheck.deadlock++;

	signaller = semaphore_waits_for(engine, &seqno);
	if (signaller == NULL)
		return -1;

	if (IS_ERR(signaller))
		return 0;

	/* Prevent pathological recursion due to driver bugs */
	if (signaller->hangcheck.deadlock >= I915_NUM_ENGINES)
		return -1;

	if (i915_seqno_passed(intel_engine_get_seqno(signaller), seqno))
		return 1;

	/* cursory check for an unkickable deadlock */
	if (I915_READ_CTL(signaller) & RING_WAIT_SEMAPHORE &&
	    semaphore_passed(signaller) < 0)
		return -1;

	return 0;
}
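/* Reset the per-engine deadlock counters used by semaphore_passed(). */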
static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, dev_priv, id)
		engine->hangcheck.deadlock = 0;
}
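/*
 * Fold the current INSTDONE sample into the accumulated state and report
 * whether it added any new bits; only fresh undone -> done transitions
 * count as progress.
 */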
static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
{
	u32 tmp = current_instdone | *old_instdone;
	bool unchanged;

	unchanged = tmp == *old_instdone;
	*old_instdone |= tmp;

	return unchanged;
}
static bool subunits_stuck(struct intel_engine_cs *engine)
{
	struct drm_i915_private *dev_priv = engine->i915;
	struct intel_instdone instdone;
	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
	bool stuck;
	int slice;
	int subslice;

	/* Only the render engine exposes subunit state to check */
	if (engine->id != RCS)
		return true;

	intel_engine_get_instdone(engine, &instdone);

	/* There might be unstable subunit states even when
	 * actual head is not moving. Filter out the unstable ones by
	 * accumulating the undone -> done transitions and only
	 * consider those as progress.
	 */
	stuck = instdone_unchanged(instdone.instdone,
				   &accu_instdone->instdone);
	stuck &= instdone_unchanged(instdone.slice_common,
				    &accu_instdone->slice_common);

	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
					    &accu_instdone->sampler[slice][subslice]);
		stuck &= instdone_unchanged(instdone.row[slice][subslice],
					    &accu_instdone->row[slice][subslice]);
	}

	return stuck;
}
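/*
 * With the seqno stalled, classify the engine: the command head may still
 * be moving, individual subunits may still be making progress, or the
 * engine is dead.
 */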
static enum intel_engine_hangcheck_action
head_stuck(struct intel_engine_cs *engine, u64 acthd)
{
	if (acthd != engine->hangcheck.acthd) {
		/* Clear subunit states on head movement */
		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		return ENGINE_ACTIVE_HEAD;
	}

	if (!subunits_stuck(engine))
		return ENGINE_ACTIVE_SUBUNITS;

	return ENGINE_DEAD;
}
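/*
 * Decide whether a non-progressing engine is truly dead or merely stuck
 * on a kickable wait (WAIT_FOR_EVENT, or a gen6/7 MBOX semaphore).
 */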
static enum intel_engine_hangcheck_action
engine_stuck(struct intel_engine_cs *engine, u64 acthd)
{
	struct drm_i915_private *dev_priv = engine->i915;
	enum intel_engine_hangcheck_action ha;
	u32 tmp;

	ha = head_stuck(engine, acthd);
	if (ha != ENGINE_DEAD)
		return ha;

	if (IS_GEN2(dev_priv))
		return ENGINE_DEAD;

	/* Is the chip hanging on a WAIT_FOR_EVENT?
	 * If so we can simply poke the RB_WAIT bit
	 * and break the hang. This should work on
	 * all but the second generation chipsets.
	 */
	tmp = I915_READ_CTL(engine);
	if (tmp & RING_WAIT) {
		i915_handle_error(dev_priv, BIT(engine->id), 0,
				  "stuck wait on %s", engine->name);
		I915_WRITE_CTL(engine, tmp);
		return ENGINE_WAIT_KICK;
	}

	if (IS_GEN(dev_priv, 6, 7) && tmp & RING_WAIT_SEMAPHORE) {
		switch (semaphore_passed(engine)) {
		default:
			return ENGINE_DEAD;
		case 1:
			i915_handle_error(dev_priv, ALL_ENGINES, 0,
					  "stuck semaphore on %s",
					  engine->name);
			I915_WRITE_CTL(engine, tmp);
			return ENGINE_WAIT_KICK;
		case 0:
			return ENGINE_WAIT;
		}
	}

	return ENGINE_DEAD;
}
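/* Take a fresh snapshot of the engine's ACTHD and completed seqno. */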
static void hangcheck_load_sample(struct intel_engine_cs *engine,
				  struct intel_engine_hangcheck *hc)
{
	/* We don't strictly need an irq-barrier here, as we are not
	 * serving an interrupt request, but be paranoid in case the
	 * barrier has side-effects (such as preventing a broken
	 * cacheline snoop) and so be sure that we can see the seqno
	 * advance. If the seqno should stick, due to a stale
	 * cacheline, we would erroneously declare the GPU hung.
	 */
	if (engine->irq_seqno_barrier)
		engine->irq_seqno_barrier(engine);

	hc->acthd = intel_engine_get_active_head(engine);
	hc->seqno = intel_engine_get_seqno(engine);
}
static void hangcheck_store_sample(struct intel_engine_cs *engine,
				   const struct intel_engine_hangcheck *hc)
{
	engine->hangcheck.acthd = hc->acthd;
	engine->hangcheck.seqno = hc->seqno;
	engine->hangcheck.action = hc->action;
	engine->hangcheck.stalled = hc->stalled;
	engine->hangcheck.wedged = hc->wedged;
}
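/*
 * Reduce the fresh sample to a hangcheck action: the seqno advanced, the
 * engine is idle, or it needs the closer inspection done by engine_stuck().
 */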
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
		     const struct intel_engine_hangcheck *hc)
{
	if (engine->hangcheck.seqno != hc->seqno)
		return ENGINE_ACTIVE_SEQNO;

	if (intel_engine_is_idle(engine))
		return ENGINE_IDLE;

	return engine_stuck(engine, hc->acthd);
}
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
					struct intel_engine_hangcheck *hc)
{
	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;

	hc->action = hangcheck_get_action(engine, hc);

	/* We always increment the progress
	 * if the engine is busy and still processing
	 * the same request, so that no single request
	 * can run indefinitely (such as a chain of
	 * batches). The only time we do not increment
	 * the hangcheck score on this engine is when it
	 * is in a legitimate wait for another engine,
	 * in which case the waiting engine is a victim
	 * and we want to be sure we catch the right
	 * culprit. Whenever we do kick the ring, count
	 * it as progress, since the seqno should then
	 * advance; if it does not, a later pass will
	 * catch the hanging engine.
	 */

	switch (hc->action) {
	case ENGINE_IDLE:
	case ENGINE_ACTIVE_SEQNO:
		/* Clear head and subunit states on seqno movement */
		hc->acthd = 0;

		memset(&engine->hangcheck.instdone, 0,
		       sizeof(engine->hangcheck.instdone));

		/* Intentional fall through */
	case ENGINE_WAIT_KICK:
	case ENGINE_WAIT:
		engine->hangcheck.action_timestamp = jiffies;
		break;

	case ENGINE_ACTIVE_HEAD:
	case ENGINE_ACTIVE_SUBUNITS:
		/*
		 * Seqno stuck with still active engine gets leeway,
		 * in hopes that it is just a long shader.
		 */
		timeout = I915_SEQNO_DEAD_TIMEOUT;
		break;

	case ENGINE_DEAD:
		if (GEM_SHOW_DEBUG()) {
			struct drm_printer p = drm_debug_printer("hangcheck");

			intel_engine_dump(engine, &p, "%s\n", engine->name);
		}
		break;

	default:
		MISSING_CASE(hc->action);
	}

	hc->stalled = time_after(jiffies,
				 engine->hangcheck.action_timestamp + timeout);
	hc->wedged = time_after(jiffies,
				engine->hangcheck.action_timestamp +
				I915_ENGINE_WEDGED_TIMEOUT);
}
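/*
 * Compose a short synopsis naming the engines that hung and hand the
 * mask to i915_handle_error() for capture and recovery.
 */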
static void hangcheck_declare_hang(struct drm_i915_private *i915,
				   unsigned int hung,
				   unsigned int stuck)
{
	struct intel_engine_cs *engine;
	char msg[80];
	unsigned int tmp;
	int len;

	/* If some rings hung but others were still busy, only
	 * blame the hanging rings in the synopsis.
	 */
	if (stuck != hung)
		hung &= ~stuck;
	len = scnprintf(msg, sizeof(msg),
			"%s on ", stuck == hung ? "no progress" : "hang");
	for_each_engine_masked(engine, i915, hung, tmp)
		len += scnprintf(msg + len, sizeof(msg) - len,
				 "%s, ", engine->name);
	msg[len - 2] = '\0';

	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
}
/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track of per-ring seqno progress
 * and, if there is no progress, the hangcheck score for that ring is
 * increased. Further, ACTHD is inspected to see if the ring is stuck;
 * if so, we kick the ring. If we see no progress on three subsequent
 * calls, we assume the chip is wedged and try to fix it by resetting
 * the chip.
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
{
	struct drm_i915_private *dev_priv =
		container_of(work, typeof(*dev_priv),
			     gpu_error.hangcheck_work.work);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int hung = 0, stuck = 0, wedged = 0;

	if (!i915_modparams.enable_hangcheck)
		return;

	if (!READ_ONCE(dev_priv->gt.awake))
		return;

	if (i915_terminally_wedged(&dev_priv->gpu_error))
		return;

	/* As enabling the GPU requires fairly extensive mmio access,
	 * periodically arm the mmio checker to see if we are triggering
	 * any invalid access.
	 */
	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

	for_each_engine(engine, dev_priv, id) {
		struct intel_engine_hangcheck hc;

		semaphore_clear_deadlocks(dev_priv);

		hangcheck_load_sample(engine, &hc);
		hangcheck_accumulate_sample(engine, &hc);
		hangcheck_store_sample(engine, &hc);

		if (engine->hangcheck.stalled) {
			hung |= intel_engine_flag(engine);
			if (hc.action != ENGINE_DEAD)
				stuck |= intel_engine_flag(engine);
		}

		if (engine->hangcheck.wedged)
			wedged |= intel_engine_flag(engine);
	}

	if (wedged) {
		dev_err(dev_priv->drm.dev,
			"GPU recovery timed out,"
			" cancelling all in-flight rendering.\n");
		i915_gem_set_wedged(dev_priv);
	}

	if (hung)
		hangcheck_declare_hang(dev_priv, hung, stuck);

	/* Reset timer in case GPU hangs without another request being added */
	i915_queue_hangcheck(dev_priv);
}
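/* Reset an engine's accumulated hangcheck state. */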
void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
	engine->hangcheck.action_timestamp = jiffies;
}
void intel_hangcheck_init(struct drm_i915_private *i915)
{
	INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
			  i915_hangcheck_elapsed);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/intel_hangcheck.c"
#endif