/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "i915_drv.h"
static bool
ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr)
{
        if (INTEL_GEN(engine->i915) >= 8) {
                return (ipehr >> 23) == 0x1c;
        } else {
                ipehr &= ~MI_SEMAPHORE_SYNC_MASK;
                return ipehr == (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE |
                                 MI_SEMAPHORE_REGISTER);
        }
}
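
/*
 * Map a decoded semaphore wait back to the engine that is expected to
 * signal it: on gen8+ by matching the wait offset against each engine's
 * signal_ggtt slot for this waiter, on earlier gens by matching the
 * MI_SEMAPHORE_SYNC bits against the mbox.wait registers. Returns
 * ERR_PTR(-ENODEV) if no engine matches.
 */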
static struct intel_engine_cs *
semaphore_wait_to_signaller_ring(struct intel_engine_cs *engine, u32 ipehr,
                                 u64 offset)
{
        struct drm_i915_private *dev_priv = engine->i915;
        struct intel_engine_cs *signaller;
        enum intel_engine_id id;

        if (INTEL_GEN(dev_priv) >= 8) {
                for_each_engine(signaller, dev_priv, id) {
                        if (engine == signaller)
                                continue;

                        if (offset == signaller->semaphore.signal_ggtt[engine->hw_id])
                                return signaller;
                }
        } else {
                u32 sync_bits = ipehr & MI_SEMAPHORE_SYNC_MASK;

                for_each_engine(signaller, dev_priv, id) {
                        if (engine == signaller)
                                continue;

                        if (sync_bits == signaller->semaphore.mbox.wait[engine->hw_id])
                                return signaller;
                }
        }

        DRM_DEBUG_DRIVER("No signaller ring found for %s, ipehr 0x%08x, offset 0x%016llx\n",
                         engine->name, ipehr, offset);

        return ERR_PTR(-ENODEV);
}
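
/*
 * Inspect the ring contents of a legacy (non-execlist) engine that is
 * stuck in a semaphore wait and work out which seqno it is waiting for
 * and which engine is supposed to signal it. Returns NULL if the engine
 * is not in a decodable semaphore wait.
 */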
static struct intel_engine_cs *
semaphore_waits_for(struct intel_engine_cs *engine, u32 *seqno)
{
        struct drm_i915_private *dev_priv = engine->i915;
        void __iomem *vaddr;
        u32 cmd, ipehr, head;
        u64 offset = 0;
        int i, backwards;

        /*
         * This function does not support execlist mode - any attempt to
         * proceed further into this function will result in a kernel panic
         * when dereferencing ring->buffer, which is not set up in execlist
         * mode.
         *
         * The correct way of doing it would be to derive the currently
         * executing ring buffer from the current context, which is derived
         * from the currently running request. Unfortunately, to get the
         * current request we would have to grab the struct_mutex before doing
         * anything else, which would be ill-advised since some other thread
         * might have grabbed it already and managed to hang itself, causing
         * the hang checker to deadlock.
         *
         * Therefore, this function does not support execlist mode in its
         * current form. Just return NULL and move on.
         */
        if (engine->buffer == NULL)
                return NULL;

        ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
        if (!ipehr_is_semaphore_wait(engine, ipehr))
                return NULL;

        /*
         * HEAD is likely pointing to the dword after the actual command,
         * so scan backwards until we find the MBOX. But limit it to just 3
         * or 4 dwords depending on the semaphore wait command size.
         * Note that we don't care about ACTHD here since that might
         * point at the batch, and semaphores are always emitted into the
         * ringbuffer itself.
         */
        head = I915_READ_HEAD(engine) & HEAD_ADDR;
        backwards = (INTEL_GEN(dev_priv) >= 8) ? 5 : 4;
        vaddr = (void __iomem *)engine->buffer->vaddr;

        for (i = backwards; i; --i) {
                /*
                 * Be paranoid and presume the hw has gone off into the wild -
                 * our ring is smaller than what the hardware (and hence
                 * HEAD_ADDR) allows. Also handles wrap-around.
                 */
                head &= engine->buffer->size - 1;

                /* This here seems to blow up */
                cmd = ioread32(vaddr + head);
                if (cmd == ipehr)
                        break;

                head -= 4;
        }

        if (!i)
                return NULL;

        *seqno = ioread32(vaddr + head + 4) + 1;
        if (INTEL_GEN(dev_priv) >= 8) {
                offset = ioread32(vaddr + head + 12);
                offset <<= 32;
                offset |= ioread32(vaddr + head + 8);
        }

        return semaphore_wait_to_signaller_ring(engine, ipehr, offset);
}
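
/*
 * Check whether the semaphore this engine is waiting on should already
 * have been signalled. Returns 1 if the signaller has passed the awaited
 * seqno (the wait can be kicked), 0 if the engine still appears to be in
 * a legitimate wait, and -1 if the wait cannot be decoded or a circular
 * semaphore deadlock is suspected.
 */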
static int semaphore_passed(struct intel_engine_cs *engine)
{
        struct drm_i915_private *dev_priv = engine->i915;
        struct intel_engine_cs *signaller;
        u32 seqno;

        engine->hangcheck.deadlock++;

        signaller = semaphore_waits_for(engine, &seqno);
        if (signaller == NULL)
                return -1;

        if (IS_ERR(signaller))
                return 0;

        /* Prevent pathological recursion due to driver bugs */
        if (signaller->hangcheck.deadlock >= I915_NUM_ENGINES)
                return -1;

        if (i915_seqno_passed(intel_engine_get_seqno(signaller), seqno))
                return 1;

        /* cursory check for an unkickable deadlock */
        if (I915_READ_CTL(signaller) & RING_WAIT_SEMAPHORE &&
            semaphore_passed(signaller) < 0)
                return -1;

        return 0;
}
static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, dev_priv, id)
                engine->hangcheck.deadlock = 0;
}
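
/*
 * Fold the current INSTDONE bits into the accumulator and report whether
 * they added anything new; only fresh transitions count as progress.
 * For example, with *old_instdone == 0x3 and current_instdone == 0x5,
 * tmp == 0x7 != 0x3, so the subunit made progress (returns false) and
 * the accumulator becomes 0x7.
 */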
static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
{
        u32 tmp = current_instdone | *old_instdone;
        bool unchanged;

        unchanged = tmp == *old_instdone;
        *old_instdone |= tmp;

        return unchanged;
}
static bool subunits_stuck(struct intel_engine_cs *engine)
{
        struct drm_i915_private *dev_priv = engine->i915;
        struct intel_instdone instdone;
        struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
        bool stuck;
        int slice;
        int subslice;

        if (engine->id != RCS)
                return true;

        intel_engine_get_instdone(engine, &instdone);

        /* There might be unstable subunit states even when
         * actual head is not moving. Filter out the unstable ones by
         * accumulating the undone -> done transitions and only
         * consider those as progress.
         */
        stuck = instdone_unchanged(instdone.instdone,
                                   &accu_instdone->instdone);
        stuck &= instdone_unchanged(instdone.slice_common,
                                    &accu_instdone->slice_common);

        for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
                stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
                                            &accu_instdone->sampler[slice][subslice]);
                stuck &= instdone_unchanged(instdone.row[slice][subslice],
                                            &accu_instdone->row[slice][subslice]);
        }

        return stuck;
}
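
/*
 * Classify an engine whose seqno has not advanced: a moving ACTHD means
 * it is still executing (ENGINE_ACTIVE_HEAD), progressing subunits mean
 * it is still grinding through work (ENGINE_ACTIVE_SUBUNITS), and
 * otherwise it is considered dead.
 */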
static enum intel_engine_hangcheck_action
head_stuck(struct intel_engine_cs *engine, u64 acthd)
{
        if (acthd != engine->hangcheck.acthd) {

                /* Clear subunit states on head movement */
                memset(&engine->hangcheck.instdone, 0,
                       sizeof(engine->hangcheck.instdone));

                return ENGINE_ACTIVE_HEAD;
        }

        if (!subunits_stuck(engine))
                return ENGINE_ACTIVE_SUBUNITS;

        return ENGINE_DEAD;
}
static enum intel_engine_hangcheck_action
engine_stuck(struct intel_engine_cs *engine, u64 acthd)
{
        struct drm_i915_private *dev_priv = engine->i915;
        enum intel_engine_hangcheck_action ha;
        u32 tmp;

        ha = head_stuck(engine, acthd);
        if (ha != ENGINE_DEAD)
                return ha;

        if (IS_GEN2(dev_priv))
                return ENGINE_DEAD;

        /* Is the chip hanging on a WAIT_FOR_EVENT?
         * If so we can simply poke the RB_WAIT bit
         * and break the hang. This should work on
         * all but the second generation chipsets.
         */
        tmp = I915_READ_CTL(engine);
        if (tmp & RING_WAIT) {
                i915_handle_error(dev_priv, 0,
                                  "Kicking stuck wait on %s",
                                  engine->name);
                I915_WRITE_CTL(engine, tmp);
                return ENGINE_WAIT_KICK;
        }

        if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) {
                switch (semaphore_passed(engine)) {
                default:
                        return ENGINE_DEAD;
                case 1:
                        i915_handle_error(dev_priv, 0,
                                          "Kicking stuck semaphore on %s",
                                          engine->name);
                        I915_WRITE_CTL(engine, tmp);
                        return ENGINE_WAIT_KICK;
                case 0:
                        return ENGINE_WAIT;
                }
        }

        return ENGINE_DEAD;
}
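
/*
 * The hangcheck worker samples each engine in three steps: load the
 * current ACTHD and seqno, accumulate them against the previous sample
 * to classify the engine, then store the result back into
 * engine->hangcheck for the next pass.
 */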
static void hangcheck_load_sample(struct intel_engine_cs *engine,
                                  struct intel_engine_hangcheck *hc)
{
        /* We don't strictly need an irq-barrier here, as we are not
         * serving an interrupt request, but be paranoid in case the
         * barrier has side-effects (such as preventing a broken
         * cacheline snoop) and so be sure that we can see the seqno
         * advance. If the seqno should stick, due to a stale
         * cacheline, we would erroneously declare the GPU hung.
         */
        if (engine->irq_seqno_barrier)
                engine->irq_seqno_barrier(engine);

        hc->acthd = intel_engine_get_active_head(engine);
        hc->seqno = intel_engine_get_seqno(engine);
}
static void hangcheck_store_sample(struct intel_engine_cs *engine,
                                   const struct intel_engine_hangcheck *hc)
{
        engine->hangcheck.acthd = hc->acthd;
        engine->hangcheck.seqno = hc->seqno;
        engine->hangcheck.action = hc->action;
        engine->hangcheck.stalled = hc->stalled;
}
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
                     const struct intel_engine_hangcheck *hc)
{
        if (engine->hangcheck.seqno != hc->seqno)
                return ENGINE_ACTIVE_SEQNO;

        if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
                return ENGINE_IDLE;

        return engine_stuck(engine, hc->acthd);
}
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
                                        struct intel_engine_hangcheck *hc)
{
        unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;

        hc->action = hangcheck_get_action(engine, hc);

        /* We always increment the progress
         * if the engine is busy and still processing
         * the same request, so that no single request
         * can run indefinitely (such as a chain of
         * batches). The only time we do not increment
         * the hangcheck score on this ring is if this
         * engine is in a legitimate wait for another
         * engine. In that case the waiting engine is a
         * victim and we want to be sure we catch the
         * right culprit. Then every time we do kick
         * the ring, count it as progress, as the seqno
         * advancement might confirm; if not, we will
         * catch the hanging engine.
         */

        switch (hc->action) {
        case ENGINE_IDLE:
        case ENGINE_ACTIVE_SEQNO:
                /* Clear head and subunit states on seqno movement */
                hc->acthd = 0;

                memset(&engine->hangcheck.instdone, 0,
                       sizeof(engine->hangcheck.instdone));

                /* Intentional fall through */
        case ENGINE_WAIT_KICK:
        case ENGINE_WAIT:
                engine->hangcheck.action_timestamp = jiffies;
                break;

        case ENGINE_ACTIVE_HEAD:
        case ENGINE_ACTIVE_SUBUNITS:
                /* Seqno stuck with still active engine gets leeway,
                 * in hopes that it is just a long shader.
                 */
                timeout = I915_SEQNO_DEAD_TIMEOUT;
                break;

        case ENGINE_DEAD:
                break;

        default:
                MISSING_CASE(hc->action);
        }

        hc->stalled = time_after(jiffies,
                                 engine->hangcheck.action_timestamp + timeout);
}
static void hangcheck_declare_hang(struct drm_i915_private *i915,
                                   unsigned int hung,
                                   unsigned int stuck)
{
        struct intel_engine_cs *engine;
        char msg[80];
        unsigned int tmp;
        int len;

        /* If some rings hung but others were still busy, only
         * blame the hanging rings in the synopsis.
         */
        if (stuck != hung)
                hung &= ~stuck;
        len = scnprintf(msg, sizeof(msg),
                        "%s on ", stuck == hung ? "No progress" : "Hang");
        for_each_engine_masked(engine, i915, hung, tmp)
                len += scnprintf(msg + len, sizeof(msg) - len,
                                 "%s, ", engine->name);
        msg[len - 2] = '\0';

        return i915_handle_error(i915, hung, msg);
}
/*
 * This is called when the chip hasn't reported back with completed
 * batchbuffers in a long time. We keep track of per-ring seqno progress
 * and, if there is no progress, the hangcheck score for that ring is
 * increased. Further, acthd is inspected to see if the ring is stuck; if
 * it is, we kick the ring. If we see no progress on three subsequent
 * calls, we assume the chip is wedged and try to fix it by resetting it.
 */
static void i915_hangcheck_elapsed(struct work_struct *work)
{
        struct drm_i915_private *dev_priv =
                container_of(work, typeof(*dev_priv),
                             gpu_error.hangcheck_work.work);
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        unsigned int hung = 0, stuck = 0;

        if (!i915.enable_hangcheck)
                return;

        if (!READ_ONCE(dev_priv->gt.awake))
                return;

        if (i915_terminally_wedged(&dev_priv->gpu_error))
                return;

        /* As enabling the GPU requires fairly extensive mmio access,
         * periodically arm the mmio checker to see if we are triggering
         * any invalid access.
         */
        intel_uncore_arm_unclaimed_mmio_detection(dev_priv);

        for_each_engine(engine, dev_priv, id) {
                struct intel_engine_hangcheck cur_state, *hc = &cur_state;
                const bool busy = intel_engine_has_waiter(engine);

                semaphore_clear_deadlocks(dev_priv);

                hangcheck_load_sample(engine, hc);
                hangcheck_accumulate_sample(engine, hc);
                hangcheck_store_sample(engine, hc);

                if (engine->hangcheck.stalled) {
                        hung |= intel_engine_flag(engine);
                        if (hc->action != ENGINE_DEAD)
                                stuck |= intel_engine_flag(engine);
                }
        }

        if (hung)
                hangcheck_declare_hang(dev_priv, hung, stuck);

        /* Reset timer in case GPU hangs without another request being added */
        i915_queue_hangcheck(dev_priv);
}
void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
{
        memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
}
void intel_hangcheck_init(struct drm_i915_private *i915)
{
        INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work,
                          i915_hangcheck_elapsed);
}