2 * Copyright © 2016 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 #include <linux/kthread.h>
27 #include "gem/i915_gem_context.h"
30 #include "intel_engine_heartbeat.h"
31 #include "intel_engine_pm.h"
32 #include "selftest_engine_heartbeat.h"
34 #include "i915_selftest.h"
35 #include "selftests/i915_random.h"
36 #include "selftests/igt_flush_test.h"
37 #include "selftests/igt_reset.h"
38 #include "selftests/igt_atomic.h"
40 #include "selftests/mock_drm.h"
42 #include "gem/selftests/mock_context.h"
43 #include "gem/selftests/igt_gem_utils.h"
45 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
49 struct drm_i915_gem_object
*hws
;
50 struct drm_i915_gem_object
*obj
;
51 struct i915_gem_context
*ctx
;
56 static int hang_init(struct hang
*h
, struct intel_gt
*gt
)
61 memset(h
, 0, sizeof(*h
));
64 h
->ctx
= kernel_context(gt
->i915
);
66 return PTR_ERR(h
->ctx
);
68 GEM_BUG_ON(i915_gem_context_is_bannable(h
->ctx
));
70 h
->hws
= i915_gem_object_create_internal(gt
->i915
, PAGE_SIZE
);
72 err
= PTR_ERR(h
->hws
);
76 h
->obj
= i915_gem_object_create_internal(gt
->i915
, PAGE_SIZE
);
78 err
= PTR_ERR(h
->obj
);
82 i915_gem_object_set_cache_coherency(h
->hws
, I915_CACHE_LLC
);
83 vaddr
= i915_gem_object_pin_map(h
->hws
, I915_MAP_WB
);
88 h
->seqno
= memset(vaddr
, 0xff, PAGE_SIZE
);
90 vaddr
= i915_gem_object_pin_map(h
->obj
,
91 i915_coherent_map_type(gt
->i915
));
101 i915_gem_object_unpin_map(h
->hws
);
103 i915_gem_object_put(h
->obj
);
105 i915_gem_object_put(h
->hws
);
107 kernel_context_close(h
->ctx
);
111 static u64
hws_address(const struct i915_vma
*hws
,
112 const struct i915_request
*rq
)
114 return hws
->node
.start
+ offset_in_page(sizeof(u32
)*rq
->fence
.context
);
117 static int move_to_active(struct i915_vma
*vma
,
118 struct i915_request
*rq
,
124 err
= i915_request_await_object(rq
, vma
->obj
,
125 flags
& EXEC_OBJECT_WRITE
);
127 err
= i915_vma_move_to_active(vma
, rq
, flags
);
128 i915_vma_unlock(vma
);
133 static struct i915_request
*
134 hang_create_request(struct hang
*h
, struct intel_engine_cs
*engine
)
136 struct intel_gt
*gt
= h
->gt
;
137 struct i915_address_space
*vm
= i915_gem_context_get_vm_rcu(h
->ctx
);
138 struct drm_i915_gem_object
*obj
;
139 struct i915_request
*rq
= NULL
;
140 struct i915_vma
*hws
, *vma
;
146 obj
= i915_gem_object_create_internal(gt
->i915
, PAGE_SIZE
);
149 return ERR_CAST(obj
);
152 vaddr
= i915_gem_object_pin_map(obj
, i915_coherent_map_type(gt
->i915
));
154 i915_gem_object_put(obj
);
156 return ERR_CAST(vaddr
);
159 i915_gem_object_unpin_map(h
->obj
);
160 i915_gem_object_put(h
->obj
);
165 vma
= i915_vma_instance(h
->obj
, vm
, NULL
);
168 return ERR_CAST(vma
);
171 hws
= i915_vma_instance(h
->hws
, vm
, NULL
);
174 return ERR_CAST(hws
);
177 err
= i915_vma_pin(vma
, 0, 0, PIN_USER
);
183 err
= i915_vma_pin(hws
, 0, 0, PIN_USER
);
187 rq
= igt_request_alloc(h
->ctx
, engine
);
193 err
= move_to_active(vma
, rq
, 0);
197 err
= move_to_active(hws
, rq
, 0);
202 if (INTEL_GEN(gt
->i915
) >= 8) {
203 *batch
++ = MI_STORE_DWORD_IMM_GEN4
;
204 *batch
++ = lower_32_bits(hws_address(hws
, rq
));
205 *batch
++ = upper_32_bits(hws_address(hws
, rq
));
206 *batch
++ = rq
->fence
.seqno
;
209 memset(batch
, 0, 1024);
210 batch
+= 1024 / sizeof(*batch
);
213 *batch
++ = MI_BATCH_BUFFER_START
| 1 << 8 | 1;
214 *batch
++ = lower_32_bits(vma
->node
.start
);
215 *batch
++ = upper_32_bits(vma
->node
.start
);
216 } else if (INTEL_GEN(gt
->i915
) >= 6) {
217 *batch
++ = MI_STORE_DWORD_IMM_GEN4
;
219 *batch
++ = lower_32_bits(hws_address(hws
, rq
));
220 *batch
++ = rq
->fence
.seqno
;
223 memset(batch
, 0, 1024);
224 batch
+= 1024 / sizeof(*batch
);
227 *batch
++ = MI_BATCH_BUFFER_START
| 1 << 8;
228 *batch
++ = lower_32_bits(vma
->node
.start
);
229 } else if (INTEL_GEN(gt
->i915
) >= 4) {
230 *batch
++ = MI_STORE_DWORD_IMM_GEN4
| MI_USE_GGTT
;
232 *batch
++ = lower_32_bits(hws_address(hws
, rq
));
233 *batch
++ = rq
->fence
.seqno
;
236 memset(batch
, 0, 1024);
237 batch
+= 1024 / sizeof(*batch
);
240 *batch
++ = MI_BATCH_BUFFER_START
| 2 << 6;
241 *batch
++ = lower_32_bits(vma
->node
.start
);
243 *batch
++ = MI_STORE_DWORD_IMM
| MI_MEM_VIRTUAL
;
244 *batch
++ = lower_32_bits(hws_address(hws
, rq
));
245 *batch
++ = rq
->fence
.seqno
;
248 memset(batch
, 0, 1024);
249 batch
+= 1024 / sizeof(*batch
);
252 *batch
++ = MI_BATCH_BUFFER_START
| 2 << 6;
253 *batch
++ = lower_32_bits(vma
->node
.start
);
255 *batch
++ = MI_BATCH_BUFFER_END
; /* not reached */
256 intel_gt_chipset_flush(engine
->gt
);
258 if (rq
->engine
->emit_init_breadcrumb
) {
259 err
= rq
->engine
->emit_init_breadcrumb(rq
);
265 if (INTEL_GEN(gt
->i915
) <= 5)
266 flags
|= I915_DISPATCH_SECURE
;
268 err
= rq
->engine
->emit_bb_start(rq
, vma
->node
.start
, PAGE_SIZE
, flags
);
272 i915_request_set_error_once(rq
, err
);
273 i915_request_add(rq
);
280 return err
? ERR_PTR(err
) : rq
;
283 static u32
hws_seqno(const struct hang
*h
, const struct i915_request
*rq
)
285 return READ_ONCE(h
->seqno
[rq
->fence
.context
% (PAGE_SIZE
/sizeof(u32
))]);
288 static void hang_fini(struct hang
*h
)
290 *h
->batch
= MI_BATCH_BUFFER_END
;
291 intel_gt_chipset_flush(h
->gt
);
293 i915_gem_object_unpin_map(h
->obj
);
294 i915_gem_object_put(h
->obj
);
296 i915_gem_object_unpin_map(h
->hws
);
297 i915_gem_object_put(h
->hws
);
299 kernel_context_close(h
->ctx
);
301 igt_flush_test(h
->gt
->i915
);
304 static bool wait_until_running(struct hang
*h
, struct i915_request
*rq
)
306 return !(wait_for_us(i915_seqno_passed(hws_seqno(h
, rq
),
309 wait_for(i915_seqno_passed(hws_seqno(h
, rq
),
314 static int igt_hang_sanitycheck(void *arg
)
316 struct intel_gt
*gt
= arg
;
317 struct i915_request
*rq
;
318 struct intel_engine_cs
*engine
;
319 enum intel_engine_id id
;
323 /* Basic check that we can execute our hanging batch */
325 err
= hang_init(&h
, gt
);
329 for_each_engine(engine
, gt
, id
) {
330 struct intel_wedge_me w
;
333 if (!intel_engine_can_store_dword(engine
))
336 rq
= hang_create_request(&h
, engine
);
339 pr_err("Failed to create request for %s, err=%d\n",
344 i915_request_get(rq
);
346 *h
.batch
= MI_BATCH_BUFFER_END
;
347 intel_gt_chipset_flush(engine
->gt
);
349 i915_request_add(rq
);
352 intel_wedge_on_timeout(&w
, gt
, HZ
/ 10 /* 100ms */)
353 timeout
= i915_request_wait(rq
, 0,
354 MAX_SCHEDULE_TIMEOUT
);
355 if (intel_gt_is_wedged(gt
))
358 i915_request_put(rq
);
362 pr_err("Wait for request failed on %s, err=%d\n",
373 static bool wait_for_idle(struct intel_engine_cs
*engine
)
375 return wait_for(intel_engine_is_idle(engine
), IGT_IDLE_TIMEOUT
) == 0;
378 static int igt_reset_nop(void *arg
)
380 struct intel_gt
*gt
= arg
;
381 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
382 struct intel_engine_cs
*engine
;
383 unsigned int reset_count
, count
;
384 enum intel_engine_id id
;
385 IGT_TIMEOUT(end_time
);
388 /* Check that we can reset during non-user portions of requests */
390 reset_count
= i915_reset_count(global
);
393 for_each_engine(engine
, gt
, id
) {
394 struct intel_context
*ce
;
397 ce
= intel_context_create(engine
);
403 for (i
= 0; i
< 16; i
++) {
404 struct i915_request
*rq
;
406 rq
= intel_context_create_request(ce
);
412 i915_request_add(rq
);
415 intel_context_put(ce
);
418 igt_global_reset_lock(gt
);
419 intel_gt_reset(gt
, ALL_ENGINES
, NULL
);
420 igt_global_reset_unlock(gt
);
422 if (intel_gt_is_wedged(gt
)) {
427 if (i915_reset_count(global
) != reset_count
+ ++count
) {
428 pr_err("Full GPU reset not recorded!\n");
433 err
= igt_flush_test(gt
->i915
);
436 } while (time_before(jiffies
, end_time
));
437 pr_info("%s: %d resets\n", __func__
, count
);
439 if (igt_flush_test(gt
->i915
))
444 static int igt_reset_nop_engine(void *arg
)
446 struct intel_gt
*gt
= arg
;
447 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
448 struct intel_engine_cs
*engine
;
449 enum intel_engine_id id
;
451 /* Check that we can engine-reset during non-user portions */
453 if (!intel_has_reset_engine(gt
))
456 for_each_engine(engine
, gt
, id
) {
457 unsigned int reset_count
, reset_engine_count
, count
;
458 struct intel_context
*ce
;
459 IGT_TIMEOUT(end_time
);
462 ce
= intel_context_create(engine
);
466 reset_count
= i915_reset_count(global
);
467 reset_engine_count
= i915_reset_engine_count(global
, engine
);
470 st_engine_heartbeat_disable(engine
);
471 set_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
475 if (!wait_for_idle(engine
)) {
476 pr_err("%s failed to idle before reset\n",
482 for (i
= 0; i
< 16; i
++) {
483 struct i915_request
*rq
;
485 rq
= intel_context_create_request(ce
);
487 struct drm_printer p
=
488 drm_info_printer(gt
->i915
->drm
.dev
);
489 intel_engine_dump(engine
, &p
,
490 "%s(%s): failed to submit request\n",
494 GEM_TRACE("%s(%s): failed to submit request\n",
499 intel_gt_set_wedged(gt
);
505 i915_request_add(rq
);
507 err
= intel_engine_reset(engine
, NULL
);
509 pr_err("i915_reset_engine failed\n");
513 if (i915_reset_count(global
) != reset_count
) {
514 pr_err("Full GPU reset recorded! (engine reset expected)\n");
519 if (i915_reset_engine_count(global
, engine
) !=
520 reset_engine_count
+ ++count
) {
521 pr_err("%s engine reset not recorded!\n",
526 } while (time_before(jiffies
, end_time
));
527 clear_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
528 st_engine_heartbeat_enable(engine
);
530 pr_info("%s(%s): %d resets\n", __func__
, engine
->name
, count
);
532 intel_context_put(ce
);
533 if (igt_flush_test(gt
->i915
))
542 static int __igt_reset_engine(struct intel_gt
*gt
, bool active
)
544 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
545 struct intel_engine_cs
*engine
;
546 enum intel_engine_id id
;
550 /* Check that we can issue an engine reset on an idle engine (no-op) */
552 if (!intel_has_reset_engine(gt
))
556 err
= hang_init(&h
, gt
);
561 for_each_engine(engine
, gt
, id
) {
562 unsigned int reset_count
, reset_engine_count
;
563 IGT_TIMEOUT(end_time
);
565 if (active
&& !intel_engine_can_store_dword(engine
))
568 if (!wait_for_idle(engine
)) {
569 pr_err("%s failed to idle before reset\n",
575 reset_count
= i915_reset_count(global
);
576 reset_engine_count
= i915_reset_engine_count(global
, engine
);
578 st_engine_heartbeat_disable(engine
);
579 set_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
582 struct i915_request
*rq
;
584 rq
= hang_create_request(&h
, engine
);
590 i915_request_get(rq
);
591 i915_request_add(rq
);
593 if (!wait_until_running(&h
, rq
)) {
594 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
596 pr_err("%s: Failed to start request %llx, at %x\n",
597 __func__
, rq
->fence
.seqno
, hws_seqno(&h
, rq
));
598 intel_engine_dump(engine
, &p
,
599 "%s\n", engine
->name
);
601 i915_request_put(rq
);
606 i915_request_put(rq
);
609 err
= intel_engine_reset(engine
, NULL
);
611 pr_err("i915_reset_engine failed\n");
615 if (i915_reset_count(global
) != reset_count
) {
616 pr_err("Full GPU reset recorded! (engine reset expected)\n");
621 if (i915_reset_engine_count(global
, engine
) !=
622 ++reset_engine_count
) {
623 pr_err("%s engine reset not recorded!\n",
628 } while (time_before(jiffies
, end_time
));
629 clear_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
630 st_engine_heartbeat_enable(engine
);
635 err
= igt_flush_test(gt
->i915
);
640 if (intel_gt_is_wedged(gt
))
649 static int igt_reset_idle_engine(void *arg
)
651 return __igt_reset_engine(arg
, false);
654 static int igt_reset_active_engine(void *arg
)
656 return __igt_reset_engine(arg
, true);
/*
 * Per-engine background load for __igt_reset_engines(): one kthread per
 * engine keeps submitting requests while another engine is being reset.
 */
struct active_engine {
	struct task_struct *task;	/* kthread running active_engine() */
	struct intel_engine_cs *engine;
	unsigned long resets;		/* engine reset count at thread start */
	unsigned int flags;
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)
};
671 static int active_request_put(struct i915_request
*rq
)
678 if (i915_request_wait(rq
, 0, 5 * HZ
) < 0) {
679 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
685 intel_gt_set_wedged(rq
->engine
->gt
);
689 i915_request_put(rq
);
694 static int active_engine(void *data
)
696 I915_RND_STATE(prng
);
697 struct active_engine
*arg
= data
;
698 struct intel_engine_cs
*engine
= arg
->engine
;
699 struct i915_request
*rq
[8] = {};
700 struct intel_context
*ce
[ARRAY_SIZE(rq
)];
704 for (count
= 0; count
< ARRAY_SIZE(ce
); count
++) {
705 ce
[count
] = intel_context_create(engine
);
706 if (IS_ERR(ce
[count
])) {
707 err
= PTR_ERR(ce
[count
]);
709 intel_context_put(ce
[count
]);
715 while (!kthread_should_stop()) {
716 unsigned int idx
= count
++ & (ARRAY_SIZE(rq
) - 1);
717 struct i915_request
*old
= rq
[idx
];
718 struct i915_request
*new;
720 new = intel_context_create_request(ce
[idx
]);
726 rq
[idx
] = i915_request_get(new);
727 i915_request_add(new);
729 if (engine
->schedule
&& arg
->flags
& TEST_PRIORITY
) {
730 struct i915_sched_attr attr
= {
732 i915_prandom_u32_max_state(512, &prng
),
734 engine
->schedule(rq
[idx
], &attr
);
737 err
= active_request_put(old
);
744 for (count
= 0; count
< ARRAY_SIZE(rq
); count
++) {
745 int err__
= active_request_put(rq
[count
]);
747 /* Keep the first error */
751 intel_context_put(ce
[count
]);
757 static int __igt_reset_engines(struct intel_gt
*gt
,
758 const char *test_name
,
761 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
762 struct intel_engine_cs
*engine
, *other
;
763 enum intel_engine_id id
, tmp
;
767 /* Check that issuing a reset on one engine does not interfere
768 * with any other engine.
771 if (!intel_has_reset_engine(gt
))
774 if (flags
& TEST_ACTIVE
) {
775 err
= hang_init(&h
, gt
);
779 if (flags
& TEST_PRIORITY
)
780 h
.ctx
->sched
.priority
= 1024;
783 for_each_engine(engine
, gt
, id
) {
784 struct active_engine threads
[I915_NUM_ENGINES
] = {};
785 unsigned long device
= i915_reset_count(global
);
786 unsigned long count
= 0, reported
;
787 IGT_TIMEOUT(end_time
);
789 if (flags
& TEST_ACTIVE
&&
790 !intel_engine_can_store_dword(engine
))
793 if (!wait_for_idle(engine
)) {
794 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
795 engine
->name
, test_name
);
800 memset(threads
, 0, sizeof(threads
));
801 for_each_engine(other
, gt
, tmp
) {
802 struct task_struct
*tsk
;
804 threads
[tmp
].resets
=
805 i915_reset_engine_count(global
, other
);
807 if (other
== engine
&& !(flags
& TEST_SELF
))
810 if (other
!= engine
&& !(flags
& TEST_OTHERS
))
813 threads
[tmp
].engine
= other
;
814 threads
[tmp
].flags
= flags
;
816 tsk
= kthread_run(active_engine
, &threads
[tmp
],
817 "igt/%s", other
->name
);
823 threads
[tmp
].task
= tsk
;
824 get_task_struct(tsk
);
827 yield(); /* start all threads before we begin */
829 st_engine_heartbeat_disable(engine
);
830 set_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
832 struct i915_request
*rq
= NULL
;
834 if (flags
& TEST_ACTIVE
) {
835 rq
= hang_create_request(&h
, engine
);
841 i915_request_get(rq
);
842 i915_request_add(rq
);
844 if (!wait_until_running(&h
, rq
)) {
845 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
847 pr_err("%s: Failed to start request %llx, at %x\n",
848 __func__
, rq
->fence
.seqno
, hws_seqno(&h
, rq
));
849 intel_engine_dump(engine
, &p
,
850 "%s\n", engine
->name
);
852 i915_request_put(rq
);
858 err
= intel_engine_reset(engine
, NULL
);
860 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
861 engine
->name
, test_name
, err
);
868 if (rq
->fence
.error
!= -EIO
) {
869 pr_err("i915_reset_engine(%s:%s):"
870 " failed to reset request %llx:%lld\n",
871 engine
->name
, test_name
,
874 i915_request_put(rq
);
877 intel_gt_set_wedged(gt
);
882 if (i915_request_wait(rq
, 0, HZ
/ 5) < 0) {
883 struct drm_printer p
=
884 drm_info_printer(gt
->i915
->drm
.dev
);
886 pr_err("i915_reset_engine(%s:%s):"
887 " failed to complete request %llx:%lld after reset\n",
888 engine
->name
, test_name
,
891 intel_engine_dump(engine
, &p
,
892 "%s\n", engine
->name
);
893 i915_request_put(rq
);
896 intel_gt_set_wedged(gt
);
901 i915_request_put(rq
);
904 if (!(flags
& TEST_SELF
) && !wait_for_idle(engine
)) {
905 struct drm_printer p
=
906 drm_info_printer(gt
->i915
->drm
.dev
);
908 pr_err("i915_reset_engine(%s:%s):"
909 " failed to idle after reset\n",
910 engine
->name
, test_name
);
911 intel_engine_dump(engine
, &p
,
912 "%s\n", engine
->name
);
917 } while (time_before(jiffies
, end_time
));
918 clear_bit(I915_RESET_ENGINE
+ id
, >
->reset
.flags
);
919 st_engine_heartbeat_enable(engine
);
921 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
922 engine
->name
, test_name
, count
);
924 reported
= i915_reset_engine_count(global
, engine
);
925 reported
-= threads
[engine
->id
].resets
;
926 if (reported
!= count
) {
927 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
928 engine
->name
, test_name
, count
, reported
);
934 for_each_engine(other
, gt
, tmp
) {
937 if (!threads
[tmp
].task
)
940 ret
= kthread_stop(threads
[tmp
].task
);
942 pr_err("kthread for other engine %s failed, err=%d\n",
947 put_task_struct(threads
[tmp
].task
);
949 if (other
->uabi_class
!= engine
->uabi_class
&&
950 threads
[tmp
].resets
!=
951 i915_reset_engine_count(global
, other
)) {
952 pr_err("Innocent engine %s was reset (count=%ld)\n",
954 i915_reset_engine_count(global
, other
) -
955 threads
[tmp
].resets
);
961 if (device
!= i915_reset_count(global
)) {
962 pr_err("Global reset (count=%ld)!\n",
963 i915_reset_count(global
) - device
);
971 err
= igt_flush_test(gt
->i915
);
976 if (intel_gt_is_wedged(gt
))
979 if (flags
& TEST_ACTIVE
)
985 static int igt_reset_engines(void *arg
)
987 static const struct {
992 { "active", TEST_ACTIVE
},
993 { "others-idle", TEST_OTHERS
},
994 { "others-active", TEST_OTHERS
| TEST_ACTIVE
},
997 TEST_OTHERS
| TEST_ACTIVE
| TEST_PRIORITY
1001 TEST_ACTIVE
| TEST_PRIORITY
| TEST_SELF
,
1005 struct intel_gt
*gt
= arg
;
1009 for (p
= phases
; p
->name
; p
++) {
1010 if (p
->flags
& TEST_PRIORITY
) {
1011 if (!(gt
->i915
->caps
.scheduler
& I915_SCHEDULER_CAP_PRIORITY
))
1015 err
= __igt_reset_engines(arg
, p
->name
, p
->flags
);
1023 static u32
fake_hangcheck(struct intel_gt
*gt
, intel_engine_mask_t mask
)
1025 u32 count
= i915_reset_count(>
->i915
->gpu_error
);
1027 intel_gt_reset(gt
, mask
, NULL
);
1032 static int igt_reset_wait(void *arg
)
1034 struct intel_gt
*gt
= arg
;
1035 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
1036 struct intel_engine_cs
*engine
= gt
->engine
[RCS0
];
1037 struct i915_request
*rq
;
1038 unsigned int reset_count
;
1043 if (!engine
|| !intel_engine_can_store_dword(engine
))
1046 /* Check that we detect a stuck waiter and issue a reset */
1048 igt_global_reset_lock(gt
);
1050 err
= hang_init(&h
, gt
);
1054 rq
= hang_create_request(&h
, engine
);
1060 i915_request_get(rq
);
1061 i915_request_add(rq
);
1063 if (!wait_until_running(&h
, rq
)) {
1064 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
1066 pr_err("%s: Failed to start request %llx, at %x\n",
1067 __func__
, rq
->fence
.seqno
, hws_seqno(&h
, rq
));
1068 intel_engine_dump(rq
->engine
, &p
, "%s\n", rq
->engine
->name
);
1070 intel_gt_set_wedged(gt
);
1076 reset_count
= fake_hangcheck(gt
, ALL_ENGINES
);
1078 timeout
= i915_request_wait(rq
, 0, 10);
1080 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1086 if (i915_reset_count(global
) == reset_count
) {
1087 pr_err("No GPU reset recorded!\n");
1093 i915_request_put(rq
);
1097 igt_global_reset_unlock(gt
);
1099 if (intel_gt_is_wedged(gt
))
1106 struct completion completion
;
1107 struct i915_vma
*vma
;
1110 static int evict_vma(void *data
)
1112 struct evict_vma
*arg
= data
;
1113 struct i915_address_space
*vm
= arg
->vma
->vm
;
1114 struct drm_mm_node evict
= arg
->vma
->node
;
1117 complete(&arg
->completion
);
1119 mutex_lock(&vm
->mutex
);
1120 err
= i915_gem_evict_for_node(vm
, &evict
, 0);
1121 mutex_unlock(&vm
->mutex
);
1126 static int evict_fence(void *data
)
1128 struct evict_vma
*arg
= data
;
1131 complete(&arg
->completion
);
1133 /* Mark the fence register as dirty to force the mmio update. */
1134 err
= i915_gem_object_set_tiling(arg
->vma
->obj
, I915_TILING_Y
, 512);
1136 pr_err("Invalid Y-tiling settings; err:%d\n", err
);
1140 err
= i915_vma_pin(arg
->vma
, 0, 0, PIN_GLOBAL
| PIN_MAPPABLE
);
1142 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err
);
1146 err
= i915_vma_pin_fence(arg
->vma
);
1147 i915_vma_unpin(arg
->vma
);
1149 pr_err("Unable to pin Y-tiled fence; err:%d\n", err
);
1153 i915_vma_unpin_fence(arg
->vma
);
1158 static int __igt_reset_evict_vma(struct intel_gt
*gt
,
1159 struct i915_address_space
*vm
,
1163 struct intel_engine_cs
*engine
= gt
->engine
[RCS0
];
1164 struct drm_i915_gem_object
*obj
;
1165 struct task_struct
*tsk
= NULL
;
1166 struct i915_request
*rq
;
1167 struct evict_vma arg
;
1169 unsigned int pin_flags
;
1172 if (!gt
->ggtt
->num_fences
&& flags
& EXEC_OBJECT_NEEDS_FENCE
)
1175 if (!engine
|| !intel_engine_can_store_dword(engine
))
1178 /* Check that we can recover an unbind stuck on a hanging request */
1180 err
= hang_init(&h
, gt
);
1184 obj
= i915_gem_object_create_internal(gt
->i915
, SZ_1M
);
1190 if (flags
& EXEC_OBJECT_NEEDS_FENCE
) {
1191 err
= i915_gem_object_set_tiling(obj
, I915_TILING_X
, 512);
1193 pr_err("Invalid X-tiling settings; err:%d\n", err
);
1198 arg
.vma
= i915_vma_instance(obj
, vm
, NULL
);
1199 if (IS_ERR(arg
.vma
)) {
1200 err
= PTR_ERR(arg
.vma
);
1204 rq
= hang_create_request(&h
, engine
);
1210 pin_flags
= i915_vma_is_ggtt(arg
.vma
) ? PIN_GLOBAL
: PIN_USER
;
1212 if (flags
& EXEC_OBJECT_NEEDS_FENCE
)
1213 pin_flags
|= PIN_MAPPABLE
;
1215 err
= i915_vma_pin(arg
.vma
, 0, 0, pin_flags
);
1217 i915_request_add(rq
);
1221 if (flags
& EXEC_OBJECT_NEEDS_FENCE
) {
1222 err
= i915_vma_pin_fence(arg
.vma
);
1224 pr_err("Unable to pin X-tiled fence; err:%d\n", err
);
1225 i915_vma_unpin(arg
.vma
);
1226 i915_request_add(rq
);
1231 i915_vma_lock(arg
.vma
);
1232 err
= i915_request_await_object(rq
, arg
.vma
->obj
,
1233 flags
& EXEC_OBJECT_WRITE
);
1235 err
= i915_vma_move_to_active(arg
.vma
, rq
, flags
);
1236 i915_vma_unlock(arg
.vma
);
1238 if (flags
& EXEC_OBJECT_NEEDS_FENCE
)
1239 i915_vma_unpin_fence(arg
.vma
);
1240 i915_vma_unpin(arg
.vma
);
1242 i915_request_get(rq
);
1243 i915_request_add(rq
);
1247 if (!wait_until_running(&h
, rq
)) {
1248 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
1250 pr_err("%s: Failed to start request %llx, at %x\n",
1251 __func__
, rq
->fence
.seqno
, hws_seqno(&h
, rq
));
1252 intel_engine_dump(rq
->engine
, &p
, "%s\n", rq
->engine
->name
);
1254 intel_gt_set_wedged(gt
);
1258 init_completion(&arg
.completion
);
1260 tsk
= kthread_run(fn
, &arg
, "igt/evict_vma");
1266 get_task_struct(tsk
);
1268 wait_for_completion(&arg
.completion
);
1270 if (wait_for(!list_empty(&rq
->fence
.cb_list
), 10)) {
1271 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
1273 pr_err("igt/evict_vma kthread did not wait\n");
1274 intel_engine_dump(rq
->engine
, &p
, "%s\n", rq
->engine
->name
);
1276 intel_gt_set_wedged(gt
);
1281 igt_global_reset_lock(gt
);
1282 fake_hangcheck(gt
, rq
->engine
->mask
);
1283 igt_global_reset_unlock(gt
);
1286 struct intel_wedge_me w
;
1288 /* The reset, even indirectly, should take less than 10ms. */
1289 intel_wedge_on_timeout(&w
, gt
, HZ
/ 10 /* 100ms */)
1290 err
= kthread_stop(tsk
);
1292 put_task_struct(tsk
);
1296 i915_request_put(rq
);
1298 i915_gem_object_put(obj
);
1301 if (intel_gt_is_wedged(gt
))
1307 static int igt_reset_evict_ggtt(void *arg
)
1309 struct intel_gt
*gt
= arg
;
1311 return __igt_reset_evict_vma(gt
, >
->ggtt
->vm
,
1312 evict_vma
, EXEC_OBJECT_WRITE
);
1315 static int igt_reset_evict_ppgtt(void *arg
)
1317 struct intel_gt
*gt
= arg
;
1318 struct i915_ppgtt
*ppgtt
;
1321 /* aliasing == global gtt locking, covered above */
1322 if (INTEL_PPGTT(gt
->i915
) < INTEL_PPGTT_FULL
)
1325 ppgtt
= i915_ppgtt_create(gt
);
1327 return PTR_ERR(ppgtt
);
1329 err
= __igt_reset_evict_vma(gt
, &ppgtt
->vm
,
1330 evict_vma
, EXEC_OBJECT_WRITE
);
1331 i915_vm_put(&ppgtt
->vm
);
1336 static int igt_reset_evict_fence(void *arg
)
1338 struct intel_gt
*gt
= arg
;
1340 return __igt_reset_evict_vma(gt
, >
->ggtt
->vm
,
1341 evict_fence
, EXEC_OBJECT_NEEDS_FENCE
);
1344 static int wait_for_others(struct intel_gt
*gt
,
1345 struct intel_engine_cs
*exclude
)
1347 struct intel_engine_cs
*engine
;
1348 enum intel_engine_id id
;
1350 for_each_engine(engine
, gt
, id
) {
1351 if (engine
== exclude
)
1354 if (!wait_for_idle(engine
))
1361 static int igt_reset_queue(void *arg
)
1363 struct intel_gt
*gt
= arg
;
1364 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
1365 struct intel_engine_cs
*engine
;
1366 enum intel_engine_id id
;
1370 /* Check that we replay pending requests following a hang */
1372 igt_global_reset_lock(gt
);
1374 err
= hang_init(&h
, gt
);
1378 for_each_engine(engine
, gt
, id
) {
1379 struct i915_request
*prev
;
1380 IGT_TIMEOUT(end_time
);
1383 if (!intel_engine_can_store_dword(engine
))
1386 prev
= hang_create_request(&h
, engine
);
1388 err
= PTR_ERR(prev
);
1392 i915_request_get(prev
);
1393 i915_request_add(prev
);
1397 struct i915_request
*rq
;
1398 unsigned int reset_count
;
1400 rq
= hang_create_request(&h
, engine
);
1406 i915_request_get(rq
);
1407 i915_request_add(rq
);
1410 * XXX We don't handle resetting the kernel context
1411 * very well. If we trigger a device reset twice in
1412 * quick succession while the kernel context is
1413 * executing, we may end up skipping the breadcrumb.
1414 * This is really only a problem for the selftest as
1415 * normally there is a large interlude between resets
1416 * (hangcheck), or we focus on resetting just one
1417 * engine and so avoid repeatedly resetting innocents.
1419 err
= wait_for_others(gt
, engine
);
1421 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1422 __func__
, engine
->name
);
1423 i915_request_put(rq
);
1424 i915_request_put(prev
);
1427 intel_gt_set_wedged(gt
);
1431 if (!wait_until_running(&h
, prev
)) {
1432 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
1434 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1435 __func__
, engine
->name
,
1436 prev
->fence
.seqno
, hws_seqno(&h
, prev
));
1437 intel_engine_dump(engine
, &p
,
1438 "%s\n", engine
->name
);
1440 i915_request_put(rq
);
1441 i915_request_put(prev
);
1443 intel_gt_set_wedged(gt
);
1449 reset_count
= fake_hangcheck(gt
, BIT(id
));
1451 if (prev
->fence
.error
!= -EIO
) {
1452 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1454 i915_request_put(rq
);
1455 i915_request_put(prev
);
1460 if (rq
->fence
.error
) {
1461 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1463 i915_request_put(rq
);
1464 i915_request_put(prev
);
1469 if (i915_reset_count(global
) == reset_count
) {
1470 pr_err("No GPU reset recorded!\n");
1471 i915_request_put(rq
);
1472 i915_request_put(prev
);
1477 i915_request_put(prev
);
1480 } while (time_before(jiffies
, end_time
));
1481 pr_info("%s: Completed %d resets\n", engine
->name
, count
);
1483 *h
.batch
= MI_BATCH_BUFFER_END
;
1484 intel_gt_chipset_flush(engine
->gt
);
1486 i915_request_put(prev
);
1488 err
= igt_flush_test(gt
->i915
);
1496 igt_global_reset_unlock(gt
);
1498 if (intel_gt_is_wedged(gt
))
1504 static int igt_handle_error(void *arg
)
1506 struct intel_gt
*gt
= arg
;
1507 struct i915_gpu_error
*global
= >
->i915
->gpu_error
;
1508 struct intel_engine_cs
*engine
= gt
->engine
[RCS0
];
1510 struct i915_request
*rq
;
1511 struct i915_gpu_coredump
*error
;
1514 /* Check that we can issue a global GPU and engine reset */
1516 if (!intel_has_reset_engine(gt
))
1519 if (!engine
|| !intel_engine_can_store_dword(engine
))
1522 err
= hang_init(&h
, gt
);
1526 rq
= hang_create_request(&h
, engine
);
1532 i915_request_get(rq
);
1533 i915_request_add(rq
);
1535 if (!wait_until_running(&h
, rq
)) {
1536 struct drm_printer p
= drm_info_printer(gt
->i915
->drm
.dev
);
1538 pr_err("%s: Failed to start request %llx, at %x\n",
1539 __func__
, rq
->fence
.seqno
, hws_seqno(&h
, rq
));
1540 intel_engine_dump(rq
->engine
, &p
, "%s\n", rq
->engine
->name
);
1542 intel_gt_set_wedged(gt
);
1548 /* Temporarily disable error capture */
1549 error
= xchg(&global
->first_error
, (void *)-1);
1551 intel_gt_handle_error(gt
, engine
->mask
, 0, NULL
);
1553 xchg(&global
->first_error
, error
);
1555 if (rq
->fence
.error
!= -EIO
) {
1556 pr_err("Guilty request not identified!\n");
1562 i915_request_put(rq
);
1568 static int __igt_atomic_reset_engine(struct intel_engine_cs
*engine
,
1569 const struct igt_atomic_section
*p
,
1572 struct tasklet_struct
* const t
= &engine
->execlists
.tasklet
;
1575 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1576 engine
->name
, mode
, p
->name
);
1579 p
->critical_section_begin();
1581 err
= intel_engine_reset(engine
, NULL
);
1583 p
->critical_section_end();
1587 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1588 engine
->name
, mode
, p
->name
);
1593 static int igt_atomic_reset_engine(struct intel_engine_cs
*engine
,
1594 const struct igt_atomic_section
*p
)
1596 struct i915_request
*rq
;
1600 err
= __igt_atomic_reset_engine(engine
, p
, "idle");
1604 err
= hang_init(&h
, engine
->gt
);
1608 rq
= hang_create_request(&h
, engine
);
1614 i915_request_get(rq
);
1615 i915_request_add(rq
);
1617 if (wait_until_running(&h
, rq
)) {
1618 err
= __igt_atomic_reset_engine(engine
, p
, "active");
1620 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1621 __func__
, engine
->name
,
1622 rq
->fence
.seqno
, hws_seqno(&h
, rq
));
1623 intel_gt_set_wedged(engine
->gt
);
1628 struct intel_wedge_me w
;
1630 intel_wedge_on_timeout(&w
, engine
->gt
, HZ
/ 20 /* 50ms */)
1631 i915_request_wait(rq
, 0, MAX_SCHEDULE_TIMEOUT
);
1632 if (intel_gt_is_wedged(engine
->gt
))
1636 i915_request_put(rq
);
1642 static int igt_reset_engines_atomic(void *arg
)
1644 struct intel_gt
*gt
= arg
;
1645 const typeof(*igt_atomic_phases
) *p
;
1648 /* Check that the engines resets are usable from atomic context */
1650 if (!intel_has_reset_engine(gt
))
1653 if (intel_uc_uses_guc_submission(>
->uc
))
1656 igt_global_reset_lock(gt
);
1658 /* Flush any requests before we get started and check basics */
1659 if (!igt_force_reset(gt
))
1662 for (p
= igt_atomic_phases
; p
->name
; p
++) {
1663 struct intel_engine_cs
*engine
;
1664 enum intel_engine_id id
;
1666 for_each_engine(engine
, gt
, id
) {
1667 err
= igt_atomic_reset_engine(engine
, p
);
1674 /* As we poke around the guts, do a full reset before continuing. */
1675 igt_force_reset(gt
);
1677 igt_global_reset_unlock(gt
);
1682 int intel_hangcheck_live_selftests(struct drm_i915_private
*i915
)
1684 static const struct i915_subtest tests
[] = {
1685 SUBTEST(igt_hang_sanitycheck
),
1686 SUBTEST(igt_reset_nop
),
1687 SUBTEST(igt_reset_nop_engine
),
1688 SUBTEST(igt_reset_idle_engine
),
1689 SUBTEST(igt_reset_active_engine
),
1690 SUBTEST(igt_reset_engines
),
1691 SUBTEST(igt_reset_engines_atomic
),
1692 SUBTEST(igt_reset_queue
),
1693 SUBTEST(igt_reset_wait
),
1694 SUBTEST(igt_reset_evict_ggtt
),
1695 SUBTEST(igt_reset_evict_ppgtt
),
1696 SUBTEST(igt_reset_evict_fence
),
1697 SUBTEST(igt_handle_error
),
1699 struct intel_gt
*gt
= &i915
->gt
;
1700 intel_wakeref_t wakeref
;
1703 if (!intel_has_gpu_reset(gt
))
1706 if (intel_gt_is_wedged(gt
))
1707 return -EIO
; /* we're long past hope of a successful reset */
1709 wakeref
= intel_runtime_pm_get(gt
->uncore
->rpm
);
1711 err
= intel_gt_live_subtests(tests
, gt
);
1713 intel_runtime_pm_put(gt
->uncore
->rpm
, wakeref
);