/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
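
/*
 * The helpers below build a self-referential "hanging" batch. struct hang
 * tracks a kernel context plus two internal objects: a status page (hws)
 * into which each request reports its seqno, and a batch object (obj) whose
 * contents spin forever. Each test starts such a request, waits for it to
 * begin executing, and then exercises the reset machinery against it.
 */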
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE / sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}
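
/*
 * The engine heartbeat would normally notice a hung engine and trigger a
 * reset on its own. These helpers zero the heartbeat interval, take an
 * engine-pm reference and park the heartbeat for the duration of a test so
 * that only the reset under test can occur, then restore the saved interval.
 */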
static void engine_heartbeat_disable(struct intel_engine_cs *engine,
				     unsigned long *saved)
{
	*saved = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine,
				    unsigned long saved)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms = saved;
}
static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}
		if (err)
			break;

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}
static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);
		int err = 0;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			if (err)
				break;

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
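
/*
 * active_engine() runs as a kthread and keeps its engine busy with a small
 * ring of requests (eight in flight, each on its own context), optionally
 * randomising their priorities, while the parent test resets another engine.
 * active_request_put() waits for and releases one of those requests, wedging
 * the GT if it fails to complete within 5 seconds.
 */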
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}
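
/*
 * fake_hangcheck() stands in for the real hang detection: it performs the
 * reset directly via intel_gt_reset() and returns the global reset count
 * sampled beforehand so callers can verify that a new reset was recorded.
 */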
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
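
/*
 * The evict_vma/evict_fence kthreads block on an operation (evicting a GTT
 * node, or re-assigning a fence register) that cannot make progress while the
 * vma is kept active by the hanging request; the test then checks that a
 * reset unblocks them within a bounded time.
 */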
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}
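
/*
 * The atomic phases (igt_atomic_phases) each wrap the engine reset in a
 * different atomic section via their critical_section_begin()/end()
 * callbacks, checking that intel_engine_reset() remains safe to call from
 * such contexts: first on an idle engine, then with a hanging request
 * executing.
 */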
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}