/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2017-2018 Intel Corporation
 */
#include <linux/prime_numbers.h>

#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "../selftests/igt_flush_test.h"
#include "../selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"
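/*
 * Selftests for intel_timeline: mock-only checks of the HWSP freelist and
 * the seqno sync filter, plus live tests that write breadcrumbs via the GPU.
 */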
static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}
static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + tl->hwsp_offset) / CACHELINE_BYTES;
}
#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)
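/*
 * Track each live timeline's HWSP cacheline in a radix tree so that any
 * duplicate allocation (two timelines sharing a cacheline) is caught.
 */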
struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};
static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_put(tl);
	}
}
static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt, NULL);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}
static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = &i915->gt;

	/*
	 * Create a bunch of timelines and check that their HWSP do not overlap.
	 * Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	drm_dev_put(&i915->drm);
	return err;
}
struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}
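/*
 * Walk the seqno filter through a table of edge cases (0, INT_MAX, u32
 * wraparound) across a wide spread of context ids, in both orderings.
 */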
static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}
static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}
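/*
 * Rough timings for the sync map: random, in-order and cyclic patterns of
 * insertions and lookups, with the measured prng overhead subtracted where
 * it would otherwise dominate.
 */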
static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself becomes a significant factor in the per-iteration
	 * timings. We try to compensate the results by measuring the overhead
	 * of the prng and subtract it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (and avoid them).
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}
int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}
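/* Emit a GPU store of @value to a GGTT address, using the MI_STORE_DWORD
 * encoding appropriate to the hardware generation.
 */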
static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (INTEL_GEN(rq->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (INTEL_GEN(rq->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}
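/* Submit a kernel request on @engine that writes @value into tl's HWSP. */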
static struct i915_request *
tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = intel_timeline_pin(tl);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}
static struct intel_timeline *
checked_intel_timeline_create(struct intel_gt *gt)
{
	struct intel_timeline *tl;

	tl = intel_timeline_create(gt, NULL);
	if (IS_ERR(tl))
		return tl;

	if (*tl->hwsp_seqno != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_put(tl);
		return ERR_PTR(-EINVAL);
	}

	return tl;
}
static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && *tl->hwsp_seqno != n) {
			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
			       n, *tl->hwsp_seqno);
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}
static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				intel_engine_pm_put(engine);
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && *tl->hwsp_seqno != n) {
			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
			       n, *tl->hwsp_seqno);
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}
static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt, NULL);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
		goto out_free;

	err = intel_timeline_pin(tl);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (*hwsp_seqno[0] != seqno[0] || *hwsp_seqno[1] != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}
static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and neither
	 * want to confuse ourselves or the GPU.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (*tl->hwsp_seqno != count) {
				pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
				       count, *tl->hwsp_seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}
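/* The live tests submit requests, so they are skipped on a wedged GT. */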
int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return intel_gt_live_subtests(tests, &i915->gt);
}