drivers/gpu/drm/i915/selftests/intel_hangcheck.c

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};
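
/*
 * Set up the shared state for the hang tests: a CPU-mapped HWS page for
 * the batches to report their seqnos into (initialised to 0xff), and a
 * page-sized internal object that will hold the hanging batch itself.
 */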
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}
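
/* Address of this request's seqno slot within the HWS page. */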
static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
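
/*
 * Build a per-gen batch that writes the request's seqno into its HWS slot
 * and then branches back to its own start, spinning forever until the
 * buffer is rewritten with MI_BATCH_BUFFER_END (or the engine is reset).
 */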
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}
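
/*
 * Allocate a request carrying the hanging batch. If the previous batch
 * object is still active on the GPU, swap in a freshly mapped one so we
 * never rewrite a batch that may still be executing.
 */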
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}
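
/* Seqno last written by the batch for this request's context. */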
static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
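
/*
 * Terminate any still-spinning batch by overwriting it with
 * MI_BATCH_BUFFER_END, release the objects and wait for the GPU to idle.
 */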
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}
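
/*
 * Wait (briefly, then for up to a second) until the batch reports its
 * seqno, i.e. the hanging request has actually started on the GPU.
 */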
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
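
/*
 * Take the global reset backoff bit and every per-engine reset bit,
 * waiting out any reset already in flight, so that no other reset can
 * run concurrently with the test.
 */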
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
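
/*
 * kthread body used to keep an engine busy in the background: it
 * ping-pongs requests between two contexts until told to stop.
 */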
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}
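
/*
 * Pretend hangcheck has declared the engine hung: mark it stalled at the
 * current seqno and signal the reset handoff so that the waiter performs
 * the reset. Returns the global reset count sampled beforehand.
 */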
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_reset(i915, 0);
				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
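
/*
 * Entry point for the live hangcheck/reset selftests. Requires GPU reset
 * support; runs with a runtime-pm wakeref held and with the hangcheck
 * modparam temporarily disabled so the tests control resets themselves.
 */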
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}