/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/prime_numbers.h>
#include <linux/pm_qos.h>
#include <linux/sort.h>

#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"

#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_requests.h"
#include "gt/selftest_engine_heartbeat.h"

#include "i915_random.h"
#include "i915_selftest.h"
#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"

#include "mock_gem_device.h"

static unsigned int num_uabi_engines(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	unsigned int count;

	count = 0;
	for_each_uabi_engine(engine, i915)
		count++;

	return count;
}

static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
{
	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
}

static int igt_add_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;

	/* Basic preliminary test to create a request and let it loose! */

	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
	if (!request)
		return -ENOMEM;

	i915_request_add(request);

	return 0;
}

static int igt_wait_request(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, then wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	i915_request_get(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) != -ETIME) {
		pr_err("request wait succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed before submit!!\n");
		goto out_request;
	}

	i915_request_add(request);

	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed immediately!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
		pr_err("request wait succeeded (expected timeout!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out!\n");
		goto out_request;
	}

	if (!i915_request_completed(request)) {
		pr_err("request not complete after waiting!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out when already complete!\n");
		goto out_request;
	}

	err = 0;
out_request:
	i915_request_put(request);
	mock_device_flush(i915);
	return err;
}

static int igt_fence_wait(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, treat it as a fence and wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (!request)
		return -ENOMEM;

	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
		pr_err("fence wait success before submit (expected timeout)!\n");
		goto out;
	}

	i915_request_add(request);

	if (dma_fence_is_signaled(&request->fence)) {
		pr_err("fence signaled immediately!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
		pr_err("fence wait success after submit (expected timeout)!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out (expected success)!\n");
		goto out;
	}

	if (!dma_fence_is_signaled(&request->fence)) {
		pr_err("fence unsignaled after waiting!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out when complete (expected success)!\n");
		goto out;
	}

	err = 0;
out:
	mock_device_flush(i915);
	return err;
}

static int igt_request_rewind(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request, *vip;
	struct i915_gem_context *ctx[2];
	struct intel_context *ce;
	int err = -EINVAL;

	ctx[0] = mock_context(i915, "A");

	ce = i915_gem_context_get_engine(ctx[0], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	request = mock_request(ce, 2 * HZ);
	intel_context_put(ce);
	if (!request) {
		err = -ENOMEM;
		goto err_context_0;
	}

	i915_request_get(request);
	i915_request_add(request);

	ctx[1] = mock_context(i915, "B");

	ce = i915_gem_context_get_engine(ctx[1], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	vip = mock_request(ce, 0);
	intel_context_put(ce);
	if (!vip) {
		err = -ENOMEM;
		goto err_context_1;
	}

	/* Simulate preemption by manual reordering */
	if (!mock_cancel_request(request)) {
		pr_err("failed to cancel request (already executed)!\n");
		i915_request_add(vip);
		goto err_context_1;
	}
	i915_request_get(vip);
	i915_request_add(vip);
	request->engine->submit_request(request);

	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
		pr_err("timed out waiting for high priority request\n");
		goto err;
	}

	if (i915_request_completed(request)) {
		pr_err("low priority request already completed\n");
		goto err;
	}

	err = 0;
err:
	i915_request_put(vip);
err_context_1:
	mock_context_close(ctx[1]);
	i915_request_put(request);
err_context_0:
	mock_context_close(ctx[0]);
	mock_device_flush(i915);
	return err;
}

struct smoketest {
	struct intel_engine_cs *engine;
	struct i915_gem_context **contexts;
	atomic_long_t num_waits, num_fences;
	int ncontexts, max_batch;
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};

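/*
 * Note (added for clarity, not in the original source): each smoketest
 * kthread is handed a pointer to one of these descriptors - see
 * mock_breadcrumbs_smoketest() and live_breadcrumbs_smoketest() below.
 * The workers only ever add to num_waits/num_fences atomically and the
 * parent sums them after kthread_stop(), so no extra locking is needed.
 */
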
static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
{
	return mock_request(ce, 0);
}

static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	return intel_context_create_request(ce);
}

static int __igt_breadcrumbs_smoketest(void *arg)
{
	struct smoketest *t = arg;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
	 */

	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
	if (!requests)
		return -ENOMEM;

	order = i915_random_order(total, &prng);
	if (!order) {
		err = -ENOMEM;
		goto out_requests;
	}

	while (!kthread_should_stop()) {
		struct i915_sw_fence *submit, *wait;
		unsigned int n, count;

		submit = heap_fence_create(GFP_KERNEL);
		if (!submit) {
			err = -ENOMEM;
			break;
		}

		wait = heap_fence_create(GFP_KERNEL);
		if (!wait) {
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			err = -ENOMEM;
			break;
		}

		i915_random_reorder(order, total, &prng);
		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);

		for (n = 0; n < count; n++) {
			struct i915_gem_context *ctx =
				t->contexts[order[n] % t->ncontexts];
			struct i915_request *rq;
			struct intel_context *ce;

			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
			GEM_BUG_ON(IS_ERR(ce));
			rq = t->request_alloc(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				count = n;
				break;
			}

			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
							       submit,
							       GFP_KERNEL);

			requests[n] = i915_request_get(rq);
			i915_request_add(rq);

			if (err >= 0)
				err = i915_sw_fence_await_dma_fence(wait,
								    &rq->fence,
								    0,
								    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				count = n;
				break;
			}
		}

		i915_sw_fence_commit(submit);
		i915_sw_fence_commit(wait);

		if (!wait_event_timeout(wait->wait,
					i915_sw_fence_done(wait),
					5 * HZ)) {
			struct i915_request *rq = requests[count - 1];

			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
			       atomic_read(&wait->pending), count,
			       rq->fence.context, rq->fence.seqno,
			       t->engine->name);

			intel_gt_set_wedged(t->engine->gt);
			GEM_BUG_ON(!i915_request_completed(rq));
			i915_sw_fence_wait(wait);
			err = -EIO;
		}

		for (n = 0; n < count; n++) {
			struct i915_request *rq = requests[n];

			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				      &rq->fence.flags)) {
				pr_err("%llu:%llu was not signaled!\n",
				       rq->fence.context, rq->fence.seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
		}

		heap_fence_put(wait);
		heap_fence_put(submit);

		if (err < 0)
			break;

		num_fences += count;
		num_waits++;
	}

	atomic_long_add(num_fences, &t->num_fences);
	atomic_long_add(num_waits, &t->num_waits);

	kfree(order);
out_requests:
	kfree(requests);
	return err;
}

static int mock_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct smoketest t = {
		.engine = rcs0(i915),
		.ncontexts = 1024,
		.max_batch = 1024,
		.request_alloc = __mock_request_alloc
	};
	unsigned int ncpus = num_online_cpus();
	struct task_struct **threads;
	unsigned int n;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	for (n = 0; n < ncpus; n++) {
		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
					 &t, "igt/%d", n);
		if (IS_ERR(threads[n])) {
			ret = PTR_ERR(threads[n]);
			ncpus = n;
			break;
		}

		get_task_struct(threads[n]);
	}

	yield(); /* start all threads before we begin */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		err = kthread_stop(threads[n]);
		if (err < 0 && !ret)
			ret = err;

		put_task_struct(threads[n]);
	}
	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break;
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}

int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	mock_destroy_device(i915);

	return err;
}

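/*
 * Aside (not from the original source): these mock subtests run against
 * mock_gem_device(), so no hardware is touched. They are typically driven
 * by the i915 selftest harness, e.g. by loading the module with
 * i915.mock_selftests=-1 on a CONFIG_DRM_I915_SELFTEST kernel; treat the
 * exact parameter name as an assumption and check i915_selftest.c.
 */
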
static int live_nop_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err = -ENODEV;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		unsigned long n, prime;
		IGT_TIMEOUT(end_time);
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		intel_engine_pm_get(engine);
		for_each_prime_number_from(prime, 1, 8192) {
			struct i915_request *request = NULL;

			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = i915_request_create(engine->kernel_context);
				if (IS_ERR(request))
					return PTR_ERR(request);

				/*
				 * This space is left intentionally blank.
				 *
				 * We do not actually want to perform any
				 * action with this request, we just want
				 * to measure the latency in allocation
				 * and submission of our breadcrumbs -
				 * ensuring that the bare request is sufficient
				 * for the system to work (i.e. proper HEAD
				 * tracking of the rings, interrupt handling,
				 * etc). It also gives us the lowest bounds
				 * of expected latency.
				 */

				i915_request_get(request);
				i915_request_add(request);
			}
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(request);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			return err;

		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
	}

	return err;
}

static struct i915_vma *empty_batch(struct drm_i915_private *i915)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	*cmd = MI_BATCH_BUFFER_END;

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(&i915->gt);

	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
	if (err)
		goto err;

	/* Force the wait now to avoid including it in the benchmark */
	err = i915_vma_sync(vma);
	if (err)
		goto err_pin;

	return vma;

err_pin:
	i915_vma_unpin(vma);
err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}

static struct i915_request *
empty_request(struct intel_engine_cs *engine,
	      struct i915_vma *batch)
{
	struct i915_request *request;
	int err;

	request = i915_request_create(engine->kernel_context);
	if (IS_ERR(request))
		return request;

	err = engine->emit_bb_start(request,
				    batch->node.start,
				    batch->node.size,
				    I915_DISPATCH_SECURE);
	if (err)
		goto out_request;

	i915_request_get(request);
out_request:
	i915_request_add(request);
	return err ? ERR_PTR(err) : request;
}

static int live_empty_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	struct i915_vma *batch;
	int err = 0;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	batch = empty_batch(i915);
	if (IS_ERR(batch))
		return PTR_ERR(batch);

	for_each_uabi_engine(engine, i915) {
		IGT_TIMEOUT(end_time);
		struct i915_request *request;
		unsigned long n, prime;
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			goto out_batch;

		intel_engine_pm_get(engine);

		/* Warmup / preload */
		request = empty_request(engine, batch);
		if (IS_ERR(request)) {
			err = PTR_ERR(request);
			intel_engine_pm_put(engine);
			goto out_batch;
		}
		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);

		for_each_prime_number_from(prime, 1, 8192) {
			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = empty_request(engine, batch);
				if (IS_ERR(request)) {
					err = PTR_ERR(request);
					intel_engine_pm_put(engine);
					goto out_batch;
				}
			}
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		i915_request_put(request);
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			goto out_batch;

		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
	}

out_batch:
	i915_vma_unpin(batch);
	i915_vma_put(batch);
	return err;
}

static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
{
	struct drm_i915_gem_object *obj;
	const int gen = INTEL_GEN(i915);
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, i915->gt.vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		goto err;

	cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	if (gen >= 8) {
		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*cmd++ = lower_32_bits(vma->node.start);
		*cmd++ = upper_32_bits(vma->node.start);
	} else if (gen >= 6) {
		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
		*cmd++ = lower_32_bits(vma->node.start);
	} else {
		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
		*cmd++ = lower_32_bits(vma->node.start);
	}
	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(&i915->gt);

	return vma;

err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}

static int recursive_batch_resolve(struct i915_vma *batch)
{
	u32 *cmd;

	cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
	if (IS_ERR(cmd))
		return PTR_ERR(cmd);

	*cmd = MI_BATCH_BUFFER_END;

	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
	i915_gem_object_unpin_map(batch->obj);

	intel_gt_chipset_flush(batch->vm->gt);

	return 0;
}

static int live_all_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	struct i915_request **request;
	struct igt_live_test t;
	struct i915_vma *batch;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines simultaneously. We
	 * send a recursive batch to each engine - checking that we don't
	 * block doing so, and that they don't complete too soon.
	 */

	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	batch = recursive_batch(i915);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
		goto out_free;
	}

	i915_vma_lock(batch);

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed with err=%d\n",
			       __func__, err);
			goto out_request;
		}

		err = i915_request_await_object(request[idx], batch->obj, 0);
		if (err == 0)
			err = i915_vma_move_to_active(batch, request[idx], 0);
		GEM_BUG_ON(err);

		err = engine->emit_bb_start(request[idx],
					    batch->node.start,
					    batch->node.size,
					    0);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		i915_request_get(request[idx]);
		i915_request_add(request[idx]);
		idx++;
	}

	i915_vma_unlock(batch);

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}
		idx++;
	}

	err = recursive_batch_resolve(batch);
	if (err) {
		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
		goto out_request;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		long timeout;

		timeout = i915_request_wait(request[idx], 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(request[idx]));
		i915_request_put(request[idx]);
		request[idx] = NULL;
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		if (request[idx])
			i915_request_put(request[idx]);
		idx++;
	}
	i915_vma_unpin(batch);
	i915_vma_put(batch);
out_free:
	kfree(request);
	return err;
}

static int live_sequential_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct i915_request **request;
	struct i915_request *prev = NULL;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines sequentially, such
	 * that each successive request waits for the earlier ones. This
	 * tests that we don't execute requests out of order, even though
	 * they are running on independent engines.
	 */

	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_vma *batch;

		batch = recursive_batch(i915);
		if (IS_ERR(batch)) {
			err = PTR_ERR(batch);
			pr_err("%s: Unable to create batch for %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_free;
		}

		i915_vma_lock(batch);
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed for %s with err=%d\n",
			       __func__, engine->name, err);
			goto out_unlock;
		}

		if (prev) {
			err = i915_request_await_dma_fence(request[idx],
							   &prev->fence);
			if (err) {
				i915_request_add(request[idx]);
				pr_err("%s: Request await failed for %s with err=%d\n",
				       __func__, engine->name, err);
				goto out_unlock;
			}
		}

		err = i915_request_await_object(request[idx],
						batch->obj, false);
		if (err == 0)
			err = i915_vma_move_to_active(batch, request[idx], 0);
		GEM_BUG_ON(err);

		err = engine->emit_bb_start(request[idx],
					    batch->node.start,
					    batch->node.size,
					    0);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		i915_request_get(request[idx]);
		i915_request_add(request[idx]);

		prev = request[idx];
		idx++;

out_unlock:
		i915_vma_unlock(batch);
		if (err)
			goto out_request;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		long timeout;

		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}

		err = recursive_batch_resolve(request[idx]->batch);
		if (err) {
			pr_err("%s: failed to resolve batch, err=%d\n",
			       __func__, err);
			goto out_request;
		}

		timeout = i915_request_wait(request[idx], 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(request[idx]));
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		u32 *cmd;

		if (!request[idx])
			break;

		cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
					      I915_MAP_WC);
		if (!IS_ERR(cmd)) {
			*cmd = MI_BATCH_BUFFER_END;

			__i915_gem_object_flush_map(request[idx]->batch->obj,
						    0, sizeof(*cmd));
			i915_gem_object_unpin_map(request[idx]->batch->obj);

			intel_gt_chipset_flush(engine->gt);
		}

		i915_vma_put(request[idx]->batch);
		i915_request_put(request[idx]);
		idx++;
	}
out_free:
	kfree(request);
	return err;
}

static int __live_parallel_engine1(void *arg)
{
	struct intel_engine_cs *engine = arg;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;

	count = 0;
	intel_engine_pm_get(engine);
	do {
		struct i915_request *rq;

		rq = i915_request_create(engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu request + sync\n", engine->name, count);
	return err;
}

static int __live_parallel_engineN(void *arg)
{
	struct intel_engine_cs *engine = arg;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;

	count = 0;
	intel_engine_pm_get(engine);
	do {
		struct i915_request *rq;

		rq = i915_request_create(engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));
	intel_engine_pm_put(engine);

	pr_info("%s: %lu requests\n", engine->name, count);
	return err;
}

static bool wake_all(struct drm_i915_private *i915)
{
	if (atomic_dec_and_test(&i915->selftest.counter)) {
		wake_up_var(&i915->selftest.counter);
		return true;
	}

	return false;
}

static int wait_for_all(struct drm_i915_private *i915)
{
	if (wake_all(i915))
		return 0;

	if (wait_var_event_timeout(&i915->selftest.counter,
				   !atomic_read(&i915->selftest.counter),
				   i915_selftest.timeout_jiffies))
		return 0;

	return -ETIME;
}

static int __live_parallel_spin(void *arg)
{
	struct intel_engine_cs *engine = arg;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	/*
	 * Create a spinner running for eternity on each engine. If a second
	 * spinner is incorrectly placed on the same engine, it will not be
	 * able to start in time.
	 */

	if (igt_spinner_init(&spin, engine->gt)) {
		wake_all(engine->i915);
		return -ENOMEM;
	}

	intel_engine_pm_get(engine);
	rq = igt_spinner_create_request(&spin,
					engine->kernel_context,
					MI_NOOP); /* no preemption */
	intel_engine_pm_put(engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		wake_all(engine->i915);
		goto out_spin;
	}

	i915_request_get(rq);
	i915_request_add(rq);
	if (igt_wait_for_spinner(&spin, rq)) {
		/* Occupy this engine for the whole test */
		err = wait_for_all(engine->i915);
	} else {
		pr_err("Failed to start spinner on %s\n", engine->name);
		err = -EINVAL;
	}
	igt_spinner_end(&spin);

	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
		err = -EIO;
	i915_request_put(rq);

out_spin:
	igt_spinner_fini(&spin);
	return err;
}

static int live_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		__live_parallel_engine1,
		__live_parallel_engineN,
		__live_parallel_spin,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct task_struct **tsk;
	int err = 0;

	/*
	 * Check we can submit requests to all engines concurrently. This
	 * tests that we load up the system maximally.
	 */

	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
	if (!tsk)
		return -ENOMEM;

	for (fn = func; !err && *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			tsk[idx] = kthread_run(*fn, engine,
					       "igt/parallel:%s", engine->name);
			if (IS_ERR(tsk[idx])) {
				err = PTR_ERR(tsk[idx]);
				break;
			}
			get_task_struct(tsk[idx++]);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(tsk[idx]))
				break;

			status = kthread_stop(tsk[idx]);
			if (status && !err)
				err = status;

			put_task_struct(tsk[idx++]);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
	}

	kfree(tsk);
	return err;
}

static int
max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int ret;

	/*
	 * Before execlists, all contexts share the same ringbuffer. With
	 * execlists, each context/engine has a separate ringbuffer and
	 * for the purposes of this test, inexhaustible.
	 *
	 * For the global ringbuffer though, we have to be very careful
	 * that we do not wrap while preventing the execution of requests
	 * with an unsignaled fence.
	 */
	if (HAS_EXECLISTS(ctx->i915))
		return INT_MAX;

	rq = igt_request_alloc(ctx, engine);
	if (IS_ERR(rq)) {
		ret = PTR_ERR(rq);
	} else {
		int sz;

		ret = rq->ring->size - rq->reserved_space;
		i915_request_add(rq);

		sz = rq->ring->emit - rq->head;
		if (sz < 0)
			sz += rq->ring->size;
		ret /= sz;
		ret /= 2; /* leave half spare, in case of emergency! */
	}

	return ret;
}

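/*
 * Worked example (illustrative numbers, not from the original source):
 * on a legacy ringbuffer of ring->size = 16 KiB with a few hundred bytes
 * of reserved_space, and a nop request consuming sz ~= 192 bytes of ring,
 * max_batches() yields roughly (16384 - 256) / 192 / 2 ~= 42 requests
 * that may safely be held back behind an unsignaled fence.
 */
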
static int live_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	const unsigned int ncpus = num_online_cpus();
	unsigned long num_waits, num_fences;
	struct intel_engine_cs *engine;
	struct task_struct **threads;
	struct igt_live_test live;
	intel_wakeref_t wakeref;
	struct smoketest *smoke;
	unsigned int n, idx;
	struct file *file;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 *
	 * On real hardware this time.
	 */

	wakeref = intel_runtime_pm_get(&i915->runtime_pm);

	file = mock_file(i915);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto out_rpm;
	}

	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
	if (!smoke) {
		ret = -ENOMEM;
		goto out_file;
	}

	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
	if (!threads) {
		ret = -ENOMEM;
		goto out_smoke;
	}

	smoke[0].request_alloc = __live_request_alloc;
	smoke[0].ncontexts = 64;
	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
				    sizeof(*smoke[0].contexts),
				    GFP_KERNEL);
	if (!smoke[0].contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < smoke[0].ncontexts; n++) {
		smoke[0].contexts[n] = live_context(i915, file);
		if (!smoke[0].contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	ret = igt_live_test_begin(&live, i915, __func__, "");
	if (ret)
		goto out_contexts;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		smoke[idx] = smoke[0];
		smoke[idx].engine = engine;
		smoke[idx].max_batch =
			max_batches(smoke[0].contexts[0], engine);
		if (smoke[idx].max_batch < 0) {
			ret = smoke[idx].max_batch;
			goto out_flush;
		}
		/* One ring interleaved between requests from all cpus */
		smoke[idx].max_batch /= num_online_cpus() + 1;
		pr_debug("Limiting batches to %d requests on %s\n",
			 smoke[idx].max_batch, engine->name);

		for (n = 0; n < ncpus; n++) {
			struct task_struct *tsk;

			tsk = kthread_run(__igt_breadcrumbs_smoketest,
					  &smoke[idx], "igt/%d.%d", idx, n);
			if (IS_ERR(tsk)) {
				ret = PTR_ERR(tsk);
				goto out_flush;
			}

			get_task_struct(tsk);
			threads[idx * ncpus + n] = tsk;
		}

		idx++;
	}

	yield(); /* start all threads before we begin */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

out_flush:
	idx = 0;
	num_waits = 0;
	num_fences = 0;
	for_each_uabi_engine(engine, i915) {
		for (n = 0; n < ncpus; n++) {
			struct task_struct *tsk = threads[idx * ncpus + n];
			int err;

			if (!tsk)
				continue;

			err = kthread_stop(tsk);
			if (err < 0 && !ret)
				ret = err;

			put_task_struct(tsk);
		}

		num_waits += atomic_long_read(&smoke[idx].num_waits);
		num_fences += atomic_long_read(&smoke[idx].num_fences);
		idx++;
	}
	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
		num_waits, num_fences, idx, ncpus);

	ret = igt_live_test_end(&live) ?: ret;
out_contexts:
	kfree(smoke[0].contexts);
out_threads:
	kfree(threads);
out_smoke:
	kfree(smoke);
out_file:
	fput(file);
out_rpm:
	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

	return ret;
}

int i915_request_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_nop_request),
		SUBTEST(live_all_engines),
		SUBTEST(live_sequential_engines),
		SUBTEST(live_parallel_engines),
		SUBTEST(live_empty_request),
		SUBTEST(live_breadcrumbs_smoketest),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}

static int switch_to_kernel_sync(struct intel_context *ce, int err)
{
	struct i915_request *rq;
	struct dma_fence *fence;

	rq = intel_engine_create_kernel_request(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	fence = i915_active_fence_get(&ce->timeline->last_request);
	if (fence) {
		i915_request_await_dma_fence(rq, fence);
		dma_fence_put(fence);
	}

	rq = i915_request_get(rq);
	i915_request_add(rq);
	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
		err = -ETIME;
	i915_request_put(rq);

	while (!err && !intel_engine_is_idle(ce->engine))
		intel_engine_flush_submission(ce->engine);

	return err;
}

struct perf_stats {
	struct intel_engine_cs *engine;
	unsigned long count;
	ktime_t time;
	ktime_t busy;
	u64 runtime;
};

struct perf_series {
	struct drm_i915_private *i915;
	unsigned int nengines;
	struct intel_context *ce[];
};

static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	return *a - *b;
}

static u32 trifilter(u32 *a)
{
	u64 sum;

	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);

	sum = mul_u32_u32(a[2], 2);
	sum += a[1];
	sum += a[3];

	GEM_BUG_ON(sum > U32_MAX);
	return sum;
}

static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
{
	u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);

	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
}

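/*
 * The maths above, spelled out (reconstruction, treat the code as the
 * authoritative version): trifilter() sorts its TF_COUNT samples and
 * returns a weighted sum centred on the median, roughly
 * 2 * a[2] + a[1] + a[3], i.e. (1 << TF_BIAS) times the filtered
 * estimate. Hence every consumer prints cycles >> TF_BIAS and
 * cycles_to_ns() divides the converted value by 1 << TF_BIAS.
 */
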
static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
{
	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
{
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = offset;
	*cs++ = 0;
	*cs++ = value;

	return cs;
}

static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
{
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		mode;
	*cs++ = value;
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
{
	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
}

static void semaphore_set(u32 *sema, u32 value)
{
	WRITE_ONCE(*sema, value);
	wmb(); /* flush the update to the cache, and beyond */
}

static u32 *hwsp_scratch(const struct intel_context *ce)
{
	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
}

static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
{
	return (i915_ggtt_offset(ce->engine->status_page.vma) +
		offset_in_page(dw));
}

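/*
 * Added summary (derived from the measure_*() functions below): the
 * measurement loops combine these helpers into a common GPU-side pattern -
 * scribble -1 into a scratch dword in the HWSP, spin with MI_SEMAPHORE_WAIT
 * until the CPU (or a preceding batch) writes the expected value, then dump
 * RING_TIMESTAMP next to it. The CPU side samples RING_TIMESTAMP around
 * semaphore_set() and subtracts to obtain the latency of interest.
 */
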
static int measure_semaphore_response(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	struct i915_request *rq;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how many cycles it takes for the HW to detect the change
	 * in a semaphore value.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Semaphore latency: B - A
	 */

	semaphore_set(sema, -1);

	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		err = PTR_ERR(cs);
		goto err;
	}

	cs = emit_store_dw(cs, offset, 0);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
		cs = emit_store_dw(cs, offset, 0);
	}

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		semaphore_set(sema, i);

		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
			err = -EIO;
			goto err;
		}

		elapsed[i - 1] = sema[i] - cycles;
	}

	cycles = trifilter(elapsed);
	pr_info("%s: semaphore response %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_idle_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is idle, but is resting in our context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
		if (err)
			return err;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
		elapsed[i] = sema[i] - elapsed[i];

	cycles = trifilter(elapsed);
	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_busy_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is busy, polling on a semaphore in our context. With
	 * direct submission, this will include the cost of a lite restore.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
			err = -EIO;
			goto err;
		}

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);

		semaphore_set(sema, i - 1);
	}

	wait_for(READ_ONCE(sema[i - 1]), 500);
	semaphore_set(sema, i - 1);

	for (i = 1; i <= TF_COUNT; i++) {
		GEM_BUG_ON(sema[i] == -1);
		elapsed[i - 1] = sema[i] - elapsed[i];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
{
	const u32 offset =
		i915_ggtt_offset(engine->status_page.vma) +
		offset_in_page(sema);
	struct i915_request *rq;
	u32 *cs;

	rq = i915_request_create(engine->kernel_context);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	cs = emit_semaphore_poll(cs, mode, value, offset);

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	return 0;
}

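/*
 * Added note: plug() parks the engine's kernel context on a semaphore poll
 * (e.g. MI_SEMAPHORE_SAD_NEQ_SDD against 0), so requests submitted
 * afterwards on the same engine stay queued behind it until
 * semaphore_set(sema, 1) releases the plug and lets them run as one
 * contiguous burst - see measure_inter_request() and
 * measure_context_switch() below.
 */
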
static int measure_inter_request(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	struct i915_sw_fence *submit;
	int i, err;

	/*
	 * Measure how long it takes to advance from one request into the
	 * next. Between each request we flush the GPU caches to memory,
	 * update the breadcrumbs, and then invalidate those caches.
	 * We queue up all the requests to be submitted in one batch so
	 * it should be one set of contiguous measurements.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Request latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	submit = heap_fence_create(GFP_KERNEL);
	if (!submit) {
		semaphore_set(sema, 1);
		return -ENOMEM;
	}

	intel_engine_flush_submission(ce->engine);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;
		u32 *cs;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_submit;
		}

		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
						       submit,
						       GFP_KERNEL);
		if (err < 0) {
			i915_request_add(rq);
			goto err_submit;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err_submit;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);
	}
	i915_sw_fence_commit(submit);
	intel_engine_flush_submission(ce->engine);
	heap_fence_put(submit);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[i + 1] - sema[i];

	cycles = trifilter(elapsed);
	pr_info("%s: inter-request latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_submit:
	i915_sw_fence_commit(submit);
	heap_fence_put(submit);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	smp_store_mb(s->seen, true); /* be safe, be strong */
}

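/*
 * Added note: the smp_store_mb() above pairs with the busy-wait in
 * measure_completion(), which spins on READ_ONCE(cb.seen) and then samples
 * RING_TIMESTAMP from the CPU; the full barrier ensures the store is
 * visible before the waiter's next read.
 */
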
static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU to be processed by the CPU.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

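/*
 * Presumed intent (interpretation, not stated in the original): bumping
 * rps.num_waiters and forcing the maximum frequency keeps the command
 * streamer clock stable for the duration of the measurements, so the cycle
 * counts gathered by the measure_*() helpers are comparable across runs.
 */
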
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time);
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt;
			ktime_t now;

			now = ktime_get();
			if (intel_engine_supports_stats(p->engine))
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy);
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}
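
/*
 * Summary of the perf suite (derived from the tests above):
 * perf_request_latency measures per-engine semaphore, dispatch, switch,
 * preemption and completion latencies; perf_series_engines rotates request
 * submission across all engines from a single thread; and
 * perf_parallel_engines drives each engine from its own kthread to report
 * busy% and throughput per engine.
 */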