Merge tag 'io_uring-5.11-2021-01-16' of git://git.kernel.dk/linux-block
[linux/fpc-iii.git] / drivers / gpu / drm / i915 / selftests / i915_request.c
blobe424a6d1a68c9433c0ba7f646e95f8255780a307
1 /*
2 * Copyright © 2016 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_requests.h"
37 #include "gt/selftest_engine_heartbeat.h"
39 #include "i915_random.h"
40 #include "i915_selftest.h"
41 #include "igt_flush_test.h"
42 #include "igt_live_test.h"
43 #include "igt_spinner.h"
44 #include "lib_sw_fence.h"
46 #include "mock_drm.h"
47 #include "mock_gem_device.h"
49 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 struct intel_engine_cs *engine;
52 unsigned int count;
54 count = 0;
55 for_each_uabi_engine(engine, i915)
56 count++;
58 return count;
61 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66 static int igt_add_request(void *arg)
68 struct drm_i915_private *i915 = arg;
69 struct i915_request *request;
71 /* Basic preliminary test to create a request and let it loose! */
73 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
74 if (!request)
75 return -ENOMEM;
77 i915_request_add(request);
79 return 0;
82 static int igt_wait_request(void *arg)
84 const long T = HZ / 4;
85 struct drm_i915_private *i915 = arg;
86 struct i915_request *request;
87 int err = -EINVAL;
89 /* Submit a request, then wait upon it */
91 request = mock_request(rcs0(i915)->kernel_context, T);
92 if (!request)
93 return -ENOMEM;
95 i915_request_get(request);
97 if (i915_request_wait(request, 0, 0) != -ETIME) {
98 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
99 goto out_request;
102 if (i915_request_wait(request, 0, T) != -ETIME) {
103 pr_err("request wait succeeded (expected timeout before submit!)\n");
104 goto out_request;
107 if (i915_request_completed(request)) {
108 pr_err("request completed before submit!!\n");
109 goto out_request;
112 i915_request_add(request);
114 if (i915_request_wait(request, 0, 0) != -ETIME) {
115 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
116 goto out_request;
119 if (i915_request_completed(request)) {
120 pr_err("request completed immediately!\n");
121 goto out_request;
124 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
125 pr_err("request wait succeeded (expected timeout!)\n");
126 goto out_request;
129 if (i915_request_wait(request, 0, T) == -ETIME) {
130 pr_err("request wait timed out!\n");
131 goto out_request;
134 if (!i915_request_completed(request)) {
135 pr_err("request not complete after waiting!\n");
136 goto out_request;
139 if (i915_request_wait(request, 0, T) == -ETIME) {
140 pr_err("request wait timed out when already complete!\n");
141 goto out_request;
144 err = 0;
145 out_request:
146 i915_request_put(request);
147 mock_device_flush(i915);
148 return err;
151 static int igt_fence_wait(void *arg)
153 const long T = HZ / 4;
154 struct drm_i915_private *i915 = arg;
155 struct i915_request *request;
156 int err = -EINVAL;
158 /* Submit a request, treat it as a fence and wait upon it */
160 request = mock_request(rcs0(i915)->kernel_context, T);
161 if (!request)
162 return -ENOMEM;
164 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
165 pr_err("fence wait success before submit (expected timeout)!\n");
166 goto out;
169 i915_request_add(request);
171 if (dma_fence_is_signaled(&request->fence)) {
172 pr_err("fence signaled immediately!\n");
173 goto out;
176 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
177 pr_err("fence wait success after submit (expected timeout)!\n");
178 goto out;
181 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
182 pr_err("fence wait timed out (expected success)!\n");
183 goto out;
186 if (!dma_fence_is_signaled(&request->fence)) {
187 pr_err("fence unsignaled after waiting!\n");
188 goto out;
191 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
192 pr_err("fence wait timed out when complete (expected success)!\n");
193 goto out;
196 err = 0;
197 out:
198 mock_device_flush(i915);
199 return err;
202 static int igt_request_rewind(void *arg)
204 struct drm_i915_private *i915 = arg;
205 struct i915_request *request, *vip;
206 struct i915_gem_context *ctx[2];
207 struct intel_context *ce;
208 int err = -EINVAL;
210 ctx[0] = mock_context(i915, "A");
212 ce = i915_gem_context_get_engine(ctx[0], RCS0);
213 GEM_BUG_ON(IS_ERR(ce));
214 request = mock_request(ce, 2 * HZ);
215 intel_context_put(ce);
216 if (!request) {
217 err = -ENOMEM;
218 goto err_context_0;
221 i915_request_get(request);
222 i915_request_add(request);
224 ctx[1] = mock_context(i915, "B");
226 ce = i915_gem_context_get_engine(ctx[1], RCS0);
227 GEM_BUG_ON(IS_ERR(ce));
228 vip = mock_request(ce, 0);
229 intel_context_put(ce);
230 if (!vip) {
231 err = -ENOMEM;
232 goto err_context_1;
235 /* Simulate preemption by manual reordering */
236 if (!mock_cancel_request(request)) {
237 pr_err("failed to cancel request (already executed)!\n");
238 i915_request_add(vip);
239 goto err_context_1;
241 i915_request_get(vip);
242 i915_request_add(vip);
243 rcu_read_lock();
244 request->engine->submit_request(request);
245 rcu_read_unlock();
248 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
249 pr_err("timed out waiting for high priority request\n");
250 goto err;
253 if (i915_request_completed(request)) {
254 pr_err("low priority request already completed\n");
255 goto err;
258 err = 0;
259 err:
260 i915_request_put(vip);
261 err_context_1:
262 mock_context_close(ctx[1]);
263 i915_request_put(request);
264 err_context_0:
265 mock_context_close(ctx[0]);
266 mock_device_flush(i915);
267 return err;
270 struct smoketest {
271 struct intel_engine_cs *engine;
272 struct i915_gem_context **contexts;
273 atomic_long_t num_waits, num_fences;
274 int ncontexts, max_batch;
275 struct i915_request *(*request_alloc)(struct intel_context *ce);
278 static struct i915_request *
279 __mock_request_alloc(struct intel_context *ce)
281 return mock_request(ce, 0);
284 static struct i915_request *
285 __live_request_alloc(struct intel_context *ce)
287 return intel_context_create_request(ce);
290 static int __igt_breadcrumbs_smoketest(void *arg)
292 struct smoketest *t = arg;
293 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294 const unsigned int total = 4 * t->ncontexts + 1;
295 unsigned int num_waits = 0, num_fences = 0;
296 struct i915_request **requests;
297 I915_RND_STATE(prng);
298 unsigned int *order;
299 int err = 0;
302 * A very simple test to catch the most egregious of list handling bugs.
304 * At its heart, we simply create oodles of requests running across
305 * multiple kthreads and enable signaling on them, for the sole purpose
306 * of stressing our breadcrumb handling. The only inspection we do is
307 * that the fences were marked as signaled.
310 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
311 if (!requests)
312 return -ENOMEM;
314 order = i915_random_order(total, &prng);
315 if (!order) {
316 err = -ENOMEM;
317 goto out_requests;
320 while (!kthread_should_stop()) {
321 struct i915_sw_fence *submit, *wait;
322 unsigned int n, count;
324 submit = heap_fence_create(GFP_KERNEL);
325 if (!submit) {
326 err = -ENOMEM;
327 break;
330 wait = heap_fence_create(GFP_KERNEL);
331 if (!wait) {
332 i915_sw_fence_commit(submit);
333 heap_fence_put(submit);
334 err = -ENOMEM;
335 break;
338 i915_random_reorder(order, total, &prng);
339 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
341 for (n = 0; n < count; n++) {
342 struct i915_gem_context *ctx =
343 t->contexts[order[n] % t->ncontexts];
344 struct i915_request *rq;
345 struct intel_context *ce;
347 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348 GEM_BUG_ON(IS_ERR(ce));
349 rq = t->request_alloc(ce);
350 intel_context_put(ce);
351 if (IS_ERR(rq)) {
352 err = PTR_ERR(rq);
353 count = n;
354 break;
357 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
358 submit,
359 GFP_KERNEL);
361 requests[n] = i915_request_get(rq);
362 i915_request_add(rq);
364 if (err >= 0)
365 err = i915_sw_fence_await_dma_fence(wait,
366 &rq->fence,
368 GFP_KERNEL);
370 if (err < 0) {
371 i915_request_put(rq);
372 count = n;
373 break;
377 i915_sw_fence_commit(submit);
378 i915_sw_fence_commit(wait);
380 if (!wait_event_timeout(wait->wait,
381 i915_sw_fence_done(wait),
382 5 * HZ)) {
383 struct i915_request *rq = requests[count - 1];
385 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386 atomic_read(&wait->pending), count,
387 rq->fence.context, rq->fence.seqno,
388 t->engine->name);
389 GEM_TRACE_DUMP();
391 intel_gt_set_wedged(t->engine->gt);
392 GEM_BUG_ON(!i915_request_completed(rq));
393 i915_sw_fence_wait(wait);
394 err = -EIO;
397 for (n = 0; n < count; n++) {
398 struct i915_request *rq = requests[n];
400 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
401 &rq->fence.flags)) {
402 pr_err("%llu:%llu was not signaled!\n",
403 rq->fence.context, rq->fence.seqno);
404 err = -EINVAL;
407 i915_request_put(rq);
410 heap_fence_put(wait);
411 heap_fence_put(submit);
413 if (err < 0)
414 break;
416 num_fences += count;
417 num_waits++;
419 cond_resched();
422 atomic_long_add(num_fences, &t->num_fences);
423 atomic_long_add(num_waits, &t->num_waits);
425 kfree(order);
426 out_requests:
427 kfree(requests);
428 return err;
431 static int mock_breadcrumbs_smoketest(void *arg)
433 struct drm_i915_private *i915 = arg;
434 struct smoketest t = {
435 .engine = rcs0(i915),
436 .ncontexts = 1024,
437 .max_batch = 1024,
438 .request_alloc = __mock_request_alloc
440 unsigned int ncpus = num_online_cpus();
441 struct task_struct **threads;
442 unsigned int n;
443 int ret = 0;
446 * Smoketest our breadcrumb/signal handling for requests across multiple
447 * threads. A very simple test to only catch the most egregious of bugs.
448 * See __igt_breadcrumbs_smoketest();
451 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
452 if (!threads)
453 return -ENOMEM;
455 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
456 if (!t.contexts) {
457 ret = -ENOMEM;
458 goto out_threads;
461 for (n = 0; n < t.ncontexts; n++) {
462 t.contexts[n] = mock_context(t.engine->i915, "mock");
463 if (!t.contexts[n]) {
464 ret = -ENOMEM;
465 goto out_contexts;
469 for (n = 0; n < ncpus; n++) {
470 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
471 &t, "igt/%d", n);
472 if (IS_ERR(threads[n])) {
473 ret = PTR_ERR(threads[n]);
474 ncpus = n;
475 break;
478 get_task_struct(threads[n]);
481 yield(); /* start all threads before we begin */
482 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484 for (n = 0; n < ncpus; n++) {
485 int err;
487 err = kthread_stop(threads[n]);
488 if (err < 0 && !ret)
489 ret = err;
491 put_task_struct(threads[n]);
493 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
494 atomic_long_read(&t.num_waits),
495 atomic_long_read(&t.num_fences),
496 ncpus);
498 out_contexts:
499 for (n = 0; n < t.ncontexts; n++) {
500 if (!t.contexts[n])
501 break;
502 mock_context_close(t.contexts[n]);
504 kfree(t.contexts);
505 out_threads:
506 kfree(threads);
507 return ret;
510 int i915_request_mock_selftests(void)
512 static const struct i915_subtest tests[] = {
513 SUBTEST(igt_add_request),
514 SUBTEST(igt_wait_request),
515 SUBTEST(igt_fence_wait),
516 SUBTEST(igt_request_rewind),
517 SUBTEST(mock_breadcrumbs_smoketest),
519 struct drm_i915_private *i915;
520 intel_wakeref_t wakeref;
521 int err = 0;
523 i915 = mock_gem_device();
524 if (!i915)
525 return -ENOMEM;
527 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528 err = i915_subtests(tests, i915);
530 mock_destroy_device(i915);
532 return err;
535 static int live_nop_request(void *arg)
537 struct drm_i915_private *i915 = arg;
538 struct intel_engine_cs *engine;
539 struct igt_live_test t;
540 int err = -ENODEV;
543 * Submit various sized batches of empty requests, to each engine
544 * (individually), and wait for the batch to complete. We can check
545 * the overhead of submitting requests to the hardware.
548 for_each_uabi_engine(engine, i915) {
549 unsigned long n, prime;
550 IGT_TIMEOUT(end_time);
551 ktime_t times[2] = {};
553 err = igt_live_test_begin(&t, i915, __func__, engine->name);
554 if (err)
555 return err;
557 intel_engine_pm_get(engine);
558 for_each_prime_number_from(prime, 1, 8192) {
559 struct i915_request *request = NULL;
561 times[1] = ktime_get_raw();
563 for (n = 0; n < prime; n++) {
564 i915_request_put(request);
565 request = i915_request_create(engine->kernel_context);
566 if (IS_ERR(request))
567 return PTR_ERR(request);
570 * This space is left intentionally blank.
572 * We do not actually want to perform any
573 * action with this request, we just want
574 * to measure the latency in allocation
575 * and submission of our breadcrumbs -
576 * ensuring that the bare request is sufficient
577 * for the system to work (i.e. proper HEAD
578 * tracking of the rings, interrupt handling,
579 * etc). It also gives us the lowest bounds
580 * for latency.
583 i915_request_get(request);
584 i915_request_add(request);
586 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587 i915_request_put(request);
589 times[1] = ktime_sub(ktime_get_raw(), times[1]);
590 if (prime == 1)
591 times[0] = times[1];
593 if (__igt_timeout(end_time, NULL))
594 break;
596 intel_engine_pm_put(engine);
598 err = igt_live_test_end(&t);
599 if (err)
600 return err;
602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
603 engine->name,
604 ktime_to_ns(times[0]),
605 prime, div64_u64(ktime_to_ns(times[1]), prime));
608 return err;
611 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
613 struct drm_i915_gem_object *obj;
614 struct i915_vma *vma;
615 u32 *cmd;
616 int err;
618 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
619 if (IS_ERR(obj))
620 return ERR_CAST(obj);
622 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
623 if (IS_ERR(cmd)) {
624 err = PTR_ERR(cmd);
625 goto err;
628 *cmd = MI_BATCH_BUFFER_END;
630 __i915_gem_object_flush_map(obj, 0, 64);
631 i915_gem_object_unpin_map(obj);
633 intel_gt_chipset_flush(&i915->gt);
635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
636 if (IS_ERR(vma)) {
637 err = PTR_ERR(vma);
638 goto err;
641 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
642 if (err)
643 goto err;
645 /* Force the wait wait now to avoid including it in the benchmark */
646 err = i915_vma_sync(vma);
647 if (err)
648 goto err_pin;
650 return vma;
652 err_pin:
653 i915_vma_unpin(vma);
654 err:
655 i915_gem_object_put(obj);
656 return ERR_PTR(err);
659 static struct i915_request *
660 empty_request(struct intel_engine_cs *engine,
661 struct i915_vma *batch)
663 struct i915_request *request;
664 int err;
666 request = i915_request_create(engine->kernel_context);
667 if (IS_ERR(request))
668 return request;
670 err = engine->emit_bb_start(request,
671 batch->node.start,
672 batch->node.size,
673 I915_DISPATCH_SECURE);
674 if (err)
675 goto out_request;
677 i915_request_get(request);
678 out_request:
679 i915_request_add(request);
680 return err ? ERR_PTR(err) : request;
683 static int live_empty_request(void *arg)
685 struct drm_i915_private *i915 = arg;
686 struct intel_engine_cs *engine;
687 struct igt_live_test t;
688 struct i915_vma *batch;
689 int err = 0;
692 * Submit various sized batches of empty requests, to each engine
693 * (individually), and wait for the batch to complete. We can check
694 * the overhead of submitting requests to the hardware.
697 batch = empty_batch(i915);
698 if (IS_ERR(batch))
699 return PTR_ERR(batch);
701 for_each_uabi_engine(engine, i915) {
702 IGT_TIMEOUT(end_time);
703 struct i915_request *request;
704 unsigned long n, prime;
705 ktime_t times[2] = {};
707 err = igt_live_test_begin(&t, i915, __func__, engine->name);
708 if (err)
709 goto out_batch;
711 intel_engine_pm_get(engine);
713 /* Warmup / preload */
714 request = empty_request(engine, batch);
715 if (IS_ERR(request)) {
716 err = PTR_ERR(request);
717 intel_engine_pm_put(engine);
718 goto out_batch;
720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
722 for_each_prime_number_from(prime, 1, 8192) {
723 times[1] = ktime_get_raw();
725 for (n = 0; n < prime; n++) {
726 i915_request_put(request);
727 request = empty_request(engine, batch);
728 if (IS_ERR(request)) {
729 err = PTR_ERR(request);
730 intel_engine_pm_put(engine);
731 goto out_batch;
734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
736 times[1] = ktime_sub(ktime_get_raw(), times[1]);
737 if (prime == 1)
738 times[0] = times[1];
740 if (__igt_timeout(end_time, NULL))
741 break;
743 i915_request_put(request);
744 intel_engine_pm_put(engine);
746 err = igt_live_test_end(&t);
747 if (err)
748 goto out_batch;
750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
751 engine->name,
752 ktime_to_ns(times[0]),
753 prime, div64_u64(ktime_to_ns(times[1]), prime));
756 out_batch:
757 i915_vma_unpin(batch);
758 i915_vma_put(batch);
759 return err;
762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
764 struct drm_i915_gem_object *obj;
765 const int gen = INTEL_GEN(i915);
766 struct i915_vma *vma;
767 u32 *cmd;
768 int err;
770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
771 if (IS_ERR(obj))
772 return ERR_CAST(obj);
774 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
775 if (IS_ERR(vma)) {
776 err = PTR_ERR(vma);
777 goto err;
780 err = i915_vma_pin(vma, 0, 0, PIN_USER);
781 if (err)
782 goto err;
784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
785 if (IS_ERR(cmd)) {
786 err = PTR_ERR(cmd);
787 goto err;
790 if (gen >= 8) {
791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
792 *cmd++ = lower_32_bits(vma->node.start);
793 *cmd++ = upper_32_bits(vma->node.start);
794 } else if (gen >= 6) {
795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
796 *cmd++ = lower_32_bits(vma->node.start);
797 } else {
798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
799 *cmd++ = lower_32_bits(vma->node.start);
801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
803 __i915_gem_object_flush_map(obj, 0, 64);
804 i915_gem_object_unpin_map(obj);
806 intel_gt_chipset_flush(&i915->gt);
808 return vma;
810 err:
811 i915_gem_object_put(obj);
812 return ERR_PTR(err);
815 static int recursive_batch_resolve(struct i915_vma *batch)
817 u32 *cmd;
819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
820 if (IS_ERR(cmd))
821 return PTR_ERR(cmd);
823 *cmd = MI_BATCH_BUFFER_END;
825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
826 i915_gem_object_unpin_map(batch->obj);
828 intel_gt_chipset_flush(batch->vm->gt);
830 return 0;
833 static int live_all_engines(void *arg)
835 struct drm_i915_private *i915 = arg;
836 const unsigned int nengines = num_uabi_engines(i915);
837 struct intel_engine_cs *engine;
838 struct i915_request **request;
839 struct igt_live_test t;
840 struct i915_vma *batch;
841 unsigned int idx;
842 int err;
845 * Check we can submit requests to all engines simultaneously. We
846 * send a recursive batch to each engine - checking that we don't
847 * block doing so, and that they don't complete too soon.
850 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
851 if (!request)
852 return -ENOMEM;
854 err = igt_live_test_begin(&t, i915, __func__, "");
855 if (err)
856 goto out_free;
858 batch = recursive_batch(i915);
859 if (IS_ERR(batch)) {
860 err = PTR_ERR(batch);
861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
862 goto out_free;
865 i915_vma_lock(batch);
867 idx = 0;
868 for_each_uabi_engine(engine, i915) {
869 request[idx] = intel_engine_create_kernel_request(engine);
870 if (IS_ERR(request[idx])) {
871 err = PTR_ERR(request[idx]);
872 pr_err("%s: Request allocation failed with err=%d\n",
873 __func__, err);
874 goto out_request;
877 err = i915_request_await_object(request[idx], batch->obj, 0);
878 if (err == 0)
879 err = i915_vma_move_to_active(batch, request[idx], 0);
880 GEM_BUG_ON(err);
882 err = engine->emit_bb_start(request[idx],
883 batch->node.start,
884 batch->node.size,
886 GEM_BUG_ON(err);
887 request[idx]->batch = batch;
889 i915_request_get(request[idx]);
890 i915_request_add(request[idx]);
891 idx++;
894 i915_vma_unlock(batch);
896 idx = 0;
897 for_each_uabi_engine(engine, i915) {
898 if (i915_request_completed(request[idx])) {
899 pr_err("%s(%s): request completed too early!\n",
900 __func__, engine->name);
901 err = -EINVAL;
902 goto out_request;
904 idx++;
907 err = recursive_batch_resolve(batch);
908 if (err) {
909 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
910 goto out_request;
913 idx = 0;
914 for_each_uabi_engine(engine, i915) {
915 long timeout;
917 timeout = i915_request_wait(request[idx], 0,
918 MAX_SCHEDULE_TIMEOUT);
919 if (timeout < 0) {
920 err = timeout;
921 pr_err("%s: error waiting for request on %s, err=%d\n",
922 __func__, engine->name, err);
923 goto out_request;
926 GEM_BUG_ON(!i915_request_completed(request[idx]));
927 i915_request_put(request[idx]);
928 request[idx] = NULL;
929 idx++;
932 err = igt_live_test_end(&t);
934 out_request:
935 idx = 0;
936 for_each_uabi_engine(engine, i915) {
937 if (request[idx])
938 i915_request_put(request[idx]);
939 idx++;
941 i915_vma_unpin(batch);
942 i915_vma_put(batch);
943 out_free:
944 kfree(request);
945 return err;
948 static int live_sequential_engines(void *arg)
950 struct drm_i915_private *i915 = arg;
951 const unsigned int nengines = num_uabi_engines(i915);
952 struct i915_request **request;
953 struct i915_request *prev = NULL;
954 struct intel_engine_cs *engine;
955 struct igt_live_test t;
956 unsigned int idx;
957 int err;
960 * Check we can submit requests to all engines sequentially, such
961 * that each successive request waits for the earlier ones. This
962 * tests that we don't execute requests out of order, even though
963 * they are running on independent engines.
966 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
967 if (!request)
968 return -ENOMEM;
970 err = igt_live_test_begin(&t, i915, __func__, "");
971 if (err)
972 goto out_free;
974 idx = 0;
975 for_each_uabi_engine(engine, i915) {
976 struct i915_vma *batch;
978 batch = recursive_batch(i915);
979 if (IS_ERR(batch)) {
980 err = PTR_ERR(batch);
981 pr_err("%s: Unable to create batch for %s, err=%d\n",
982 __func__, engine->name, err);
983 goto out_free;
986 i915_vma_lock(batch);
987 request[idx] = intel_engine_create_kernel_request(engine);
988 if (IS_ERR(request[idx])) {
989 err = PTR_ERR(request[idx]);
990 pr_err("%s: Request allocation failed for %s with err=%d\n",
991 __func__, engine->name, err);
992 goto out_unlock;
995 if (prev) {
996 err = i915_request_await_dma_fence(request[idx],
997 &prev->fence);
998 if (err) {
999 i915_request_add(request[idx]);
1000 pr_err("%s: Request await failed for %s with err=%d\n",
1001 __func__, engine->name, err);
1002 goto out_unlock;
1006 err = i915_request_await_object(request[idx],
1007 batch->obj, false);
1008 if (err == 0)
1009 err = i915_vma_move_to_active(batch, request[idx], 0);
1010 GEM_BUG_ON(err);
1012 err = engine->emit_bb_start(request[idx],
1013 batch->node.start,
1014 batch->node.size,
1016 GEM_BUG_ON(err);
1017 request[idx]->batch = batch;
1019 i915_request_get(request[idx]);
1020 i915_request_add(request[idx]);
1022 prev = request[idx];
1023 idx++;
1025 out_unlock:
1026 i915_vma_unlock(batch);
1027 if (err)
1028 goto out_request;
1031 idx = 0;
1032 for_each_uabi_engine(engine, i915) {
1033 long timeout;
1035 if (i915_request_completed(request[idx])) {
1036 pr_err("%s(%s): request completed too early!\n",
1037 __func__, engine->name);
1038 err = -EINVAL;
1039 goto out_request;
1042 err = recursive_batch_resolve(request[idx]->batch);
1043 if (err) {
1044 pr_err("%s: failed to resolve batch, err=%d\n",
1045 __func__, err);
1046 goto out_request;
1049 timeout = i915_request_wait(request[idx], 0,
1050 MAX_SCHEDULE_TIMEOUT);
1051 if (timeout < 0) {
1052 err = timeout;
1053 pr_err("%s: error waiting for request on %s, err=%d\n",
1054 __func__, engine->name, err);
1055 goto out_request;
1058 GEM_BUG_ON(!i915_request_completed(request[idx]));
1059 idx++;
1062 err = igt_live_test_end(&t);
1064 out_request:
1065 idx = 0;
1066 for_each_uabi_engine(engine, i915) {
1067 u32 *cmd;
1069 if (!request[idx])
1070 break;
1072 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1073 I915_MAP_WC);
1074 if (!IS_ERR(cmd)) {
1075 *cmd = MI_BATCH_BUFFER_END;
1077 __i915_gem_object_flush_map(request[idx]->batch->obj,
1078 0, sizeof(*cmd));
1079 i915_gem_object_unpin_map(request[idx]->batch->obj);
1081 intel_gt_chipset_flush(engine->gt);
1084 i915_vma_put(request[idx]->batch);
1085 i915_request_put(request[idx]);
1086 idx++;
1088 out_free:
1089 kfree(request);
1090 return err;
1093 static int __live_parallel_engine1(void *arg)
1095 struct intel_engine_cs *engine = arg;
1096 IGT_TIMEOUT(end_time);
1097 unsigned long count;
1098 int err = 0;
1100 count = 0;
1101 intel_engine_pm_get(engine);
1102 do {
1103 struct i915_request *rq;
1105 rq = i915_request_create(engine->kernel_context);
1106 if (IS_ERR(rq)) {
1107 err = PTR_ERR(rq);
1108 break;
1111 i915_request_get(rq);
1112 i915_request_add(rq);
1114 err = 0;
1115 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1116 err = -ETIME;
1117 i915_request_put(rq);
1118 if (err)
1119 break;
1121 count++;
1122 } while (!__igt_timeout(end_time, NULL));
1123 intel_engine_pm_put(engine);
1125 pr_info("%s: %lu request + sync\n", engine->name, count);
1126 return err;
1129 static int __live_parallel_engineN(void *arg)
1131 struct intel_engine_cs *engine = arg;
1132 IGT_TIMEOUT(end_time);
1133 unsigned long count;
1134 int err = 0;
1136 count = 0;
1137 intel_engine_pm_get(engine);
1138 do {
1139 struct i915_request *rq;
1141 rq = i915_request_create(engine->kernel_context);
1142 if (IS_ERR(rq)) {
1143 err = PTR_ERR(rq);
1144 break;
1147 i915_request_add(rq);
1148 count++;
1149 } while (!__igt_timeout(end_time, NULL));
1150 intel_engine_pm_put(engine);
1152 pr_info("%s: %lu requests\n", engine->name, count);
1153 return err;
1156 static bool wake_all(struct drm_i915_private *i915)
1158 if (atomic_dec_and_test(&i915->selftest.counter)) {
1159 wake_up_var(&i915->selftest.counter);
1160 return true;
1163 return false;
1166 static int wait_for_all(struct drm_i915_private *i915)
1168 if (wake_all(i915))
1169 return 0;
1171 if (wait_var_event_timeout(&i915->selftest.counter,
1172 !atomic_read(&i915->selftest.counter),
1173 i915_selftest.timeout_jiffies))
1174 return 0;
1176 return -ETIME;
1179 static int __live_parallel_spin(void *arg)
1181 struct intel_engine_cs *engine = arg;
1182 struct igt_spinner spin;
1183 struct i915_request *rq;
1184 int err = 0;
1187 * Create a spinner running for eternity on each engine. If a second
1188 * spinner is incorrectly placed on the same engine, it will not be
1189 * able to start in time.
1192 if (igt_spinner_init(&spin, engine->gt)) {
1193 wake_all(engine->i915);
1194 return -ENOMEM;
1197 intel_engine_pm_get(engine);
1198 rq = igt_spinner_create_request(&spin,
1199 engine->kernel_context,
1200 MI_NOOP); /* no preemption */
1201 intel_engine_pm_put(engine);
1202 if (IS_ERR(rq)) {
1203 err = PTR_ERR(rq);
1204 if (err == -ENODEV)
1205 err = 0;
1206 wake_all(engine->i915);
1207 goto out_spin;
1210 i915_request_get(rq);
1211 i915_request_add(rq);
1212 if (igt_wait_for_spinner(&spin, rq)) {
1213 /* Occupy this engine for the whole test */
1214 err = wait_for_all(engine->i915);
1215 } else {
1216 pr_err("Failed to start spinner on %s\n", engine->name);
1217 err = -EINVAL;
1219 igt_spinner_end(&spin);
1221 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1222 err = -EIO;
1223 i915_request_put(rq);
1225 out_spin:
1226 igt_spinner_fini(&spin);
1227 return err;
1230 static int live_parallel_engines(void *arg)
1232 struct drm_i915_private *i915 = arg;
1233 static int (* const func[])(void *arg) = {
1234 __live_parallel_engine1,
1235 __live_parallel_engineN,
1236 __live_parallel_spin,
1237 NULL,
1239 const unsigned int nengines = num_uabi_engines(i915);
1240 struct intel_engine_cs *engine;
1241 int (* const *fn)(void *arg);
1242 struct task_struct **tsk;
1243 int err = 0;
1246 * Check we can submit requests to all engines concurrently. This
1247 * tests that we load up the system maximally.
1250 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1251 if (!tsk)
1252 return -ENOMEM;
1254 for (fn = func; !err && *fn; fn++) {
1255 char name[KSYM_NAME_LEN];
1256 struct igt_live_test t;
1257 unsigned int idx;
1259 snprintf(name, sizeof(name), "%ps", *fn);
1260 err = igt_live_test_begin(&t, i915, __func__, name);
1261 if (err)
1262 break;
1264 atomic_set(&i915->selftest.counter, nengines);
1266 idx = 0;
1267 for_each_uabi_engine(engine, i915) {
1268 tsk[idx] = kthread_run(*fn, engine,
1269 "igt/parallel:%s",
1270 engine->name);
1271 if (IS_ERR(tsk[idx])) {
1272 err = PTR_ERR(tsk[idx]);
1273 break;
1275 get_task_struct(tsk[idx++]);
1278 yield(); /* start all threads before we kthread_stop() */
1280 idx = 0;
1281 for_each_uabi_engine(engine, i915) {
1282 int status;
1284 if (IS_ERR(tsk[idx]))
1285 break;
1287 status = kthread_stop(tsk[idx]);
1288 if (status && !err)
1289 err = status;
1291 put_task_struct(tsk[idx++]);
1294 if (igt_live_test_end(&t))
1295 err = -EIO;
1298 kfree(tsk);
1299 return err;
1302 static int
1303 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1305 struct i915_request *rq;
1306 int ret;
1309 * Before execlists, all contexts share the same ringbuffer. With
1310 * execlists, each context/engine has a separate ringbuffer and
1311 * for the purposes of this test, inexhaustible.
1313 * For the global ringbuffer though, we have to be very careful
1314 * that we do not wrap while preventing the execution of requests
1315 * with a unsignaled fence.
1317 if (HAS_EXECLISTS(ctx->i915))
1318 return INT_MAX;
1320 rq = igt_request_alloc(ctx, engine);
1321 if (IS_ERR(rq)) {
1322 ret = PTR_ERR(rq);
1323 } else {
1324 int sz;
1326 ret = rq->ring->size - rq->reserved_space;
1327 i915_request_add(rq);
1329 sz = rq->ring->emit - rq->head;
1330 if (sz < 0)
1331 sz += rq->ring->size;
1332 ret /= sz;
1333 ret /= 2; /* leave half spare, in case of emergency! */
1336 return ret;
1339 static int live_breadcrumbs_smoketest(void *arg)
1341 struct drm_i915_private *i915 = arg;
1342 const unsigned int nengines = num_uabi_engines(i915);
1343 const unsigned int ncpus = num_online_cpus();
1344 unsigned long num_waits, num_fences;
1345 struct intel_engine_cs *engine;
1346 struct task_struct **threads;
1347 struct igt_live_test live;
1348 intel_wakeref_t wakeref;
1349 struct smoketest *smoke;
1350 unsigned int n, idx;
1351 struct file *file;
1352 int ret = 0;
1355 * Smoketest our breadcrumb/signal handling for requests across multiple
1356 * threads. A very simple test to only catch the most egregious of bugs.
1357 * See __igt_breadcrumbs_smoketest();
1359 * On real hardware this time.
1362 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1364 file = mock_file(i915);
1365 if (IS_ERR(file)) {
1366 ret = PTR_ERR(file);
1367 goto out_rpm;
1370 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1371 if (!smoke) {
1372 ret = -ENOMEM;
1373 goto out_file;
1376 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1377 if (!threads) {
1378 ret = -ENOMEM;
1379 goto out_smoke;
1382 smoke[0].request_alloc = __live_request_alloc;
1383 smoke[0].ncontexts = 64;
1384 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1385 sizeof(*smoke[0].contexts),
1386 GFP_KERNEL);
1387 if (!smoke[0].contexts) {
1388 ret = -ENOMEM;
1389 goto out_threads;
1392 for (n = 0; n < smoke[0].ncontexts; n++) {
1393 smoke[0].contexts[n] = live_context(i915, file);
1394 if (!smoke[0].contexts[n]) {
1395 ret = -ENOMEM;
1396 goto out_contexts;
1400 ret = igt_live_test_begin(&live, i915, __func__, "");
1401 if (ret)
1402 goto out_contexts;
1404 idx = 0;
1405 for_each_uabi_engine(engine, i915) {
1406 smoke[idx] = smoke[0];
1407 smoke[idx].engine = engine;
1408 smoke[idx].max_batch =
1409 max_batches(smoke[0].contexts[0], engine);
1410 if (smoke[idx].max_batch < 0) {
1411 ret = smoke[idx].max_batch;
1412 goto out_flush;
1414 /* One ring interleaved between requests from all cpus */
1415 smoke[idx].max_batch /= num_online_cpus() + 1;
1416 pr_debug("Limiting batches to %d requests on %s\n",
1417 smoke[idx].max_batch, engine->name);
1419 for (n = 0; n < ncpus; n++) {
1420 struct task_struct *tsk;
1422 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1423 &smoke[idx], "igt/%d.%d", idx, n);
1424 if (IS_ERR(tsk)) {
1425 ret = PTR_ERR(tsk);
1426 goto out_flush;
1429 get_task_struct(tsk);
1430 threads[idx * ncpus + n] = tsk;
1433 idx++;
1436 yield(); /* start all threads before we begin */
1437 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1439 out_flush:
1440 idx = 0;
1441 num_waits = 0;
1442 num_fences = 0;
1443 for_each_uabi_engine(engine, i915) {
1444 for (n = 0; n < ncpus; n++) {
1445 struct task_struct *tsk = threads[idx * ncpus + n];
1446 int err;
1448 if (!tsk)
1449 continue;
1451 err = kthread_stop(tsk);
1452 if (err < 0 && !ret)
1453 ret = err;
1455 put_task_struct(tsk);
1458 num_waits += atomic_long_read(&smoke[idx].num_waits);
1459 num_fences += atomic_long_read(&smoke[idx].num_fences);
1460 idx++;
1462 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1463 num_waits, num_fences, idx, ncpus);
1465 ret = igt_live_test_end(&live) ?: ret;
1466 out_contexts:
1467 kfree(smoke[0].contexts);
1468 out_threads:
1469 kfree(threads);
1470 out_smoke:
1471 kfree(smoke);
1472 out_file:
1473 fput(file);
1474 out_rpm:
1475 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1477 return ret;
1480 int i915_request_live_selftests(struct drm_i915_private *i915)
1482 static const struct i915_subtest tests[] = {
1483 SUBTEST(live_nop_request),
1484 SUBTEST(live_all_engines),
1485 SUBTEST(live_sequential_engines),
1486 SUBTEST(live_parallel_engines),
1487 SUBTEST(live_empty_request),
1488 SUBTEST(live_breadcrumbs_smoketest),
1491 if (intel_gt_is_wedged(&i915->gt))
1492 return 0;
1494 return i915_subtests(tests, i915);
1497 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1499 struct i915_request *rq;
1500 struct dma_fence *fence;
1502 rq = intel_engine_create_kernel_request(ce->engine);
1503 if (IS_ERR(rq))
1504 return PTR_ERR(rq);
1506 fence = i915_active_fence_get(&ce->timeline->last_request);
1507 if (fence) {
1508 i915_request_await_dma_fence(rq, fence);
1509 dma_fence_put(fence);
1512 rq = i915_request_get(rq);
1513 i915_request_add(rq);
1514 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1515 err = -ETIME;
1516 i915_request_put(rq);
1518 while (!err && !intel_engine_is_idle(ce->engine))
1519 intel_engine_flush_submission(ce->engine);
1521 return err;
1524 struct perf_stats {
1525 struct intel_engine_cs *engine;
1526 unsigned long count;
1527 ktime_t time;
1528 ktime_t busy;
1529 u64 runtime;
1532 struct perf_series {
1533 struct drm_i915_private *i915;
1534 unsigned int nengines;
1535 struct intel_context *ce[];
1538 static int cmp_u32(const void *A, const void *B)
1540 const u32 *a = A, *b = B;
1542 return *a - *b;
1545 static u32 trifilter(u32 *a)
1547 u64 sum;
1549 #define TF_COUNT 5
1550 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1552 sum = mul_u32_u32(a[2], 2);
1553 sum += a[1];
1554 sum += a[3];
1556 GEM_BUG_ON(sum > U32_MAX);
1557 return sum;
1558 #define TF_BIAS 2
1561 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1563 u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1565 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1568 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1570 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1571 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1572 *cs++ = offset;
1573 *cs++ = 0;
1575 return cs;
1578 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1580 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1581 *cs++ = offset;
1582 *cs++ = 0;
1583 *cs++ = value;
1585 return cs;
1588 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1590 *cs++ = MI_SEMAPHORE_WAIT |
1591 MI_SEMAPHORE_GLOBAL_GTT |
1592 MI_SEMAPHORE_POLL |
1593 mode;
1594 *cs++ = value;
1595 *cs++ = offset;
1596 *cs++ = 0;
1598 return cs;
1601 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1603 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1606 static void semaphore_set(u32 *sema, u32 value)
1608 WRITE_ONCE(*sema, value);
1609 wmb(); /* flush the update to the cache, and beyond */
1612 static u32 *hwsp_scratch(const struct intel_context *ce)
1614 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1617 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1619 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1620 offset_in_page(dw));
1623 static int measure_semaphore_response(struct intel_context *ce)
1625 u32 *sema = hwsp_scratch(ce);
1626 const u32 offset = hwsp_offset(ce, sema);
1627 u32 elapsed[TF_COUNT], cycles;
1628 struct i915_request *rq;
1629 u32 *cs;
1630 int err;
1631 int i;
1634 * Measure how many cycles it takes for the HW to detect the change
1635 * in a semaphore value.
1637 * A: read CS_TIMESTAMP from CPU
1638 * poke semaphore
1639 * B: read CS_TIMESTAMP on GPU
1641 * Semaphore latency: B - A
1644 semaphore_set(sema, -1);
1646 rq = i915_request_create(ce);
1647 if (IS_ERR(rq))
1648 return PTR_ERR(rq);
1650 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1651 if (IS_ERR(cs)) {
1652 i915_request_add(rq);
1653 err = PTR_ERR(cs);
1654 goto err;
1657 cs = emit_store_dw(cs, offset, 0);
1658 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1659 cs = emit_semaphore_poll_until(cs, offset, i);
1660 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1661 cs = emit_store_dw(cs, offset, 0);
1664 intel_ring_advance(rq, cs);
1665 i915_request_add(rq);
1667 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1668 err = -EIO;
1669 goto err;
1672 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1673 preempt_disable();
1674 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1675 semaphore_set(sema, i);
1676 preempt_enable();
1678 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1679 err = -EIO;
1680 goto err;
1683 elapsed[i - 1] = sema[i] - cycles;
1686 cycles = trifilter(elapsed);
1687 pr_info("%s: semaphore response %d cycles, %lluns\n",
1688 ce->engine->name, cycles >> TF_BIAS,
1689 cycles_to_ns(ce->engine, cycles));
1691 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1693 err:
1694 intel_gt_set_wedged(ce->engine->gt);
1695 return err;
1698 static int measure_idle_dispatch(struct intel_context *ce)
1700 u32 *sema = hwsp_scratch(ce);
1701 const u32 offset = hwsp_offset(ce, sema);
1702 u32 elapsed[TF_COUNT], cycles;
1703 u32 *cs;
1704 int err;
1705 int i;
1708 * Measure how long it takes for us to submit a request while the
1709 * engine is idle, but is resting in our context.
1711 * A: read CS_TIMESTAMP from CPU
1712 * submit request
1713 * B: read CS_TIMESTAMP on GPU
1715 * Submission latency: B - A
1718 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1719 struct i915_request *rq;
1721 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1722 if (err)
1723 return err;
1725 rq = i915_request_create(ce);
1726 if (IS_ERR(rq)) {
1727 err = PTR_ERR(rq);
1728 goto err;
1731 cs = intel_ring_begin(rq, 4);
1732 if (IS_ERR(cs)) {
1733 i915_request_add(rq);
1734 err = PTR_ERR(cs);
1735 goto err;
1738 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1740 intel_ring_advance(rq, cs);
1742 preempt_disable();
1743 local_bh_disable();
1744 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1745 i915_request_add(rq);
1746 local_bh_enable();
1747 preempt_enable();
1750 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1751 if (err)
1752 goto err;
1754 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1755 elapsed[i] = sema[i] - elapsed[i];
1757 cycles = trifilter(elapsed);
1758 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1759 ce->engine->name, cycles >> TF_BIAS,
1760 cycles_to_ns(ce->engine, cycles));
1762 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1764 err:
1765 intel_gt_set_wedged(ce->engine->gt);
1766 return err;
1769 static int measure_busy_dispatch(struct intel_context *ce)
1771 u32 *sema = hwsp_scratch(ce);
1772 const u32 offset = hwsp_offset(ce, sema);
1773 u32 elapsed[TF_COUNT + 1], cycles;
1774 u32 *cs;
1775 int err;
1776 int i;
1779 * Measure how long it takes for us to submit a request while the
1780 * engine is busy, polling on a semaphore in our context. With
1781 * direct submission, this will include the cost of a lite restore.
1783 * A: read CS_TIMESTAMP from CPU
1784 * submit request
1785 * B: read CS_TIMESTAMP on GPU
1787 * Submission latency: B - A
1790 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1791 struct i915_request *rq;
1793 rq = i915_request_create(ce);
1794 if (IS_ERR(rq)) {
1795 err = PTR_ERR(rq);
1796 goto err;
1799 cs = intel_ring_begin(rq, 12);
1800 if (IS_ERR(cs)) {
1801 i915_request_add(rq);
1802 err = PTR_ERR(cs);
1803 goto err;
1806 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1807 cs = emit_semaphore_poll_until(cs, offset, i);
1808 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1810 intel_ring_advance(rq, cs);
1812 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1813 err = -EIO;
1814 goto err;
1817 preempt_disable();
1818 local_bh_disable();
1819 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1820 i915_request_add(rq);
1821 local_bh_enable();
1822 semaphore_set(sema, i - 1);
1823 preempt_enable();
1826 wait_for(READ_ONCE(sema[i - 1]), 500);
1827 semaphore_set(sema, i - 1);
1829 for (i = 1; i <= TF_COUNT; i++) {
1830 GEM_BUG_ON(sema[i] == -1);
1831 elapsed[i - 1] = sema[i] - elapsed[i];
1834 cycles = trifilter(elapsed);
1835 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1836 ce->engine->name, cycles >> TF_BIAS,
1837 cycles_to_ns(ce->engine, cycles));
1839 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1841 err:
1842 intel_gt_set_wedged(ce->engine->gt);
1843 return err;
1846 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1848 const u32 offset =
1849 i915_ggtt_offset(engine->status_page.vma) +
1850 offset_in_page(sema);
1851 struct i915_request *rq;
1852 u32 *cs;
1854 rq = i915_request_create(engine->kernel_context);
1855 if (IS_ERR(rq))
1856 return PTR_ERR(rq);
1858 cs = intel_ring_begin(rq, 4);
1859 if (IS_ERR(cs)) {
1860 i915_request_add(rq);
1861 return PTR_ERR(cs);
1864 cs = emit_semaphore_poll(cs, mode, value, offset);
1866 intel_ring_advance(rq, cs);
1867 i915_request_add(rq);
1869 return 0;
1872 static int measure_inter_request(struct intel_context *ce)
1874 u32 *sema = hwsp_scratch(ce);
1875 const u32 offset = hwsp_offset(ce, sema);
1876 u32 elapsed[TF_COUNT + 1], cycles;
1877 struct i915_sw_fence *submit;
1878 int i, err;
1881 * Measure how long it takes to advance from one request into the
1882 * next. Between each request we flush the GPU caches to memory,
1883 * update the breadcrumbs, and then invalidate those caches.
1884 * We queue up all the requests to be submitted in one batch so
1885 * it should be one set of contiguous measurements.
1887 * A: read CS_TIMESTAMP on GPU
1888 * advance request
1889 * B: read CS_TIMESTAMP on GPU
1891 * Request latency: B - A
1894 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1895 if (err)
1896 return err;
1898 submit = heap_fence_create(GFP_KERNEL);
1899 if (!submit) {
1900 semaphore_set(sema, 1);
1901 return -ENOMEM;
1904 intel_engine_flush_submission(ce->engine);
1905 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1906 struct i915_request *rq;
1907 u32 *cs;
1909 rq = i915_request_create(ce);
1910 if (IS_ERR(rq)) {
1911 err = PTR_ERR(rq);
1912 goto err_submit;
1915 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1916 submit,
1917 GFP_KERNEL);
1918 if (err < 0) {
1919 i915_request_add(rq);
1920 goto err_submit;
1923 cs = intel_ring_begin(rq, 4);
1924 if (IS_ERR(cs)) {
1925 i915_request_add(rq);
1926 err = PTR_ERR(cs);
1927 goto err_submit;
1930 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1932 intel_ring_advance(rq, cs);
1933 i915_request_add(rq);
1935 local_bh_disable();
1936 i915_sw_fence_commit(submit);
1937 local_bh_enable();
1938 intel_engine_flush_submission(ce->engine);
1939 heap_fence_put(submit);
1941 semaphore_set(sema, 1);
1942 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1943 if (err)
1944 goto err;
1946 for (i = 1; i <= TF_COUNT; i++)
1947 elapsed[i - 1] = sema[i + 1] - sema[i];
1949 cycles = trifilter(elapsed);
1950 pr_info("%s: inter-request latency %d cycles, %lluns\n",
1951 ce->engine->name, cycles >> TF_BIAS,
1952 cycles_to_ns(ce->engine, cycles));
1954 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1956 err_submit:
1957 i915_sw_fence_commit(submit);
1958 heap_fence_put(submit);
1959 semaphore_set(sema, 1);
1960 err:
1961 intel_gt_set_wedged(ce->engine->gt);
1962 return err;
1965 static int measure_context_switch(struct intel_context *ce)
1967 u32 *sema = hwsp_scratch(ce);
1968 const u32 offset = hwsp_offset(ce, sema);
1969 struct i915_request *fence = NULL;
1970 u32 elapsed[TF_COUNT + 1], cycles;
1971 int i, j, err;
1972 u32 *cs;
1975 * Measure how long it takes to advance from one request in one
1976 * context to a request in another context. This allows us to
1977 * measure how long the context save/restore take, along with all
1978 * the inter-context setup we require.
1980 * A: read CS_TIMESTAMP on GPU
1981 * switch context
1982 * B: read CS_TIMESTAMP on GPU
1984 * Context switch latency: B - A
1987 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1988 if (err)
1989 return err;
1991 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1992 struct intel_context *arr[] = {
1993 ce, ce->engine->kernel_context
1995 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1997 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1998 struct i915_request *rq;
2000 rq = i915_request_create(arr[j]);
2001 if (IS_ERR(rq)) {
2002 err = PTR_ERR(rq);
2003 goto err_fence;
2006 if (fence) {
2007 err = i915_request_await_dma_fence(rq,
2008 &fence->fence);
2009 if (err) {
2010 i915_request_add(rq);
2011 goto err_fence;
2015 cs = intel_ring_begin(rq, 4);
2016 if (IS_ERR(cs)) {
2017 i915_request_add(rq);
2018 err = PTR_ERR(cs);
2019 goto err_fence;
2022 cs = emit_timestamp_store(cs, ce, addr);
2023 addr += sizeof(u32);
2025 intel_ring_advance(rq, cs);
2027 i915_request_put(fence);
2028 fence = i915_request_get(rq);
2030 i915_request_add(rq);
2033 i915_request_put(fence);
2034 intel_engine_flush_submission(ce->engine);
2036 semaphore_set(sema, 1);
2037 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2038 if (err)
2039 goto err;
2041 for (i = 1; i <= TF_COUNT; i++)
2042 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2044 cycles = trifilter(elapsed);
2045 pr_info("%s: context switch latency %d cycles, %lluns\n",
2046 ce->engine->name, cycles >> TF_BIAS,
2047 cycles_to_ns(ce->engine, cycles));
2049 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2051 err_fence:
2052 i915_request_put(fence);
2053 semaphore_set(sema, 1);
2054 err:
2055 intel_gt_set_wedged(ce->engine->gt);
2056 return err;
2059 static int measure_preemption(struct intel_context *ce)
2061 u32 *sema = hwsp_scratch(ce);
2062 const u32 offset = hwsp_offset(ce, sema);
2063 u32 elapsed[TF_COUNT], cycles;
2064 u32 *cs;
2065 int err;
2066 int i;
2069 * We measure two latencies while triggering preemption. The first
2070 * latency is how long it takes for us to submit a preempting request.
2071 * The second latency is how it takes for us to return from the
2072 * preemption back to the original context.
2074 * A: read CS_TIMESTAMP from CPU
2075 * submit preemption
2076 * B: read CS_TIMESTAMP on GPU (in preempting context)
2077 * context switch
2078 * C: read CS_TIMESTAMP on GPU (in original context)
2080 * Preemption dispatch latency: B - A
2081 * Preemption switch latency: C - B
2084 if (!intel_engine_has_preemption(ce->engine))
2085 return 0;
2087 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2088 u32 addr = offset + 2 * i * sizeof(u32);
2089 struct i915_request *rq;
2091 rq = i915_request_create(ce);
2092 if (IS_ERR(rq)) {
2093 err = PTR_ERR(rq);
2094 goto err;
2097 cs = intel_ring_begin(rq, 12);
2098 if (IS_ERR(cs)) {
2099 i915_request_add(rq);
2100 err = PTR_ERR(cs);
2101 goto err;
2104 cs = emit_store_dw(cs, addr, -1);
2105 cs = emit_semaphore_poll_until(cs, offset, i);
2106 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2108 intel_ring_advance(rq, cs);
2109 i915_request_add(rq);
2111 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2112 err = -EIO;
2113 goto err;
2116 rq = i915_request_create(ce->engine->kernel_context);
2117 if (IS_ERR(rq)) {
2118 err = PTR_ERR(rq);
2119 goto err;
2122 cs = intel_ring_begin(rq, 8);
2123 if (IS_ERR(cs)) {
2124 i915_request_add(rq);
2125 err = PTR_ERR(cs);
2126 goto err;
2129 cs = emit_timestamp_store(cs, ce, addr);
2130 cs = emit_store_dw(cs, offset, i);
2132 intel_ring_advance(rq, cs);
2133 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2135 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136 i915_request_add(rq);
2139 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2140 err = -EIO;
2141 goto err;
2144 for (i = 1; i <= TF_COUNT; i++)
2145 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2147 cycles = trifilter(elapsed);
2148 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2149 ce->engine->name, cycles >> TF_BIAS,
2150 cycles_to_ns(ce->engine, cycles));
2152 for (i = 1; i <= TF_COUNT; i++)
2153 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2155 cycles = trifilter(elapsed);
2156 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2157 ce->engine->name, cycles >> TF_BIAS,
2158 cycles_to_ns(ce->engine, cycles));
2160 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2162 err:
2163 intel_gt_set_wedged(ce->engine->gt);
2164 return err;
2167 struct signal_cb {
2168 struct dma_fence_cb base;
2169 bool seen;
2172 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2174 struct signal_cb *s = container_of(cb, typeof(*s), base);
2176 smp_store_mb(s->seen, true); /* be safe, be strong */
2179 static int measure_completion(struct intel_context *ce)
2181 u32 *sema = hwsp_scratch(ce);
2182 const u32 offset = hwsp_offset(ce, sema);
2183 u32 elapsed[TF_COUNT], cycles;
2184 u32 *cs;
2185 int err;
2186 int i;
2189 * Measure how long it takes for the signal (interrupt) to be
2190 * sent from the GPU to be processed by the CPU.
2192 * A: read CS_TIMESTAMP on GPU
2193 * signal
2194 * B: read CS_TIMESTAMP from CPU
2196 * Completion latency: B - A
2199 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2200 struct signal_cb cb = { .seen = false };
2201 struct i915_request *rq;
2203 rq = i915_request_create(ce);
2204 if (IS_ERR(rq)) {
2205 err = PTR_ERR(rq);
2206 goto err;
2209 cs = intel_ring_begin(rq, 12);
2210 if (IS_ERR(cs)) {
2211 i915_request_add(rq);
2212 err = PTR_ERR(cs);
2213 goto err;
2216 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2217 cs = emit_semaphore_poll_until(cs, offset, i);
2218 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2220 intel_ring_advance(rq, cs);
2222 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2224 local_bh_disable();
2225 i915_request_add(rq);
2226 local_bh_enable();
2228 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2229 err = -EIO;
2230 goto err;
2233 preempt_disable();
2234 semaphore_set(sema, i);
2235 while (!READ_ONCE(cb.seen))
2236 cpu_relax();
2238 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2239 preempt_enable();
2242 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2243 if (err)
2244 goto err;
2246 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2247 GEM_BUG_ON(sema[i + 1] == -1);
2248 elapsed[i] = elapsed[i] - sema[i + 1];
2251 cycles = trifilter(elapsed);
2252 pr_info("%s: completion latency %d cycles, %lluns\n",
2253 ce->engine->name, cycles >> TF_BIAS,
2254 cycles_to_ns(ce->engine, cycles));
2256 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2258 err:
2259 intel_gt_set_wedged(ce->engine->gt);
2260 return err;
2263 static void rps_pin(struct intel_gt *gt)
2265 /* Pin the frequency to max */
2266 atomic_inc(&gt->rps.num_waiters);
2267 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2269 mutex_lock(&gt->rps.lock);
2270 intel_rps_set(&gt->rps, gt->rps.max_freq);
2271 mutex_unlock(&gt->rps.lock);
2274 static void rps_unpin(struct intel_gt *gt)
2276 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2277 atomic_dec(&gt->rps.num_waiters);
2280 static int perf_request_latency(void *arg)
2282 struct drm_i915_private *i915 = arg;
2283 struct intel_engine_cs *engine;
2284 struct pm_qos_request qos;
2285 int err = 0;
2287 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2288 return 0;
2290 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2292 for_each_uabi_engine(engine, i915) {
2293 struct intel_context *ce;
2295 ce = intel_context_create(engine);
2296 if (IS_ERR(ce)) {
2297 err = PTR_ERR(ce);
2298 goto out;
2301 err = intel_context_pin(ce);
2302 if (err) {
2303 intel_context_put(ce);
2304 goto out;
2307 st_engine_heartbeat_disable(engine);
2308 rps_pin(engine->gt);
2310 if (err == 0)
2311 err = measure_semaphore_response(ce);
2312 if (err == 0)
2313 err = measure_idle_dispatch(ce);
2314 if (err == 0)
2315 err = measure_busy_dispatch(ce);
2316 if (err == 0)
2317 err = measure_inter_request(ce);
2318 if (err == 0)
2319 err = measure_context_switch(ce);
2320 if (err == 0)
2321 err = measure_preemption(ce);
2322 if (err == 0)
2323 err = measure_completion(ce);
2325 rps_unpin(engine->gt);
2326 st_engine_heartbeat_enable(engine);
2328 intel_context_unpin(ce);
2329 intel_context_put(ce);
2330 if (err)
2331 goto out;
2334 out:
2335 if (igt_flush_test(i915))
2336 err = -EIO;
2338 cpu_latency_qos_remove_request(&qos);
2339 return err;
2342 static int s_sync0(void *arg)
2344 struct perf_series *ps = arg;
2345 IGT_TIMEOUT(end_time);
2346 unsigned int idx = 0;
2347 int err = 0;
2349 GEM_BUG_ON(!ps->nengines);
2350 do {
2351 struct i915_request *rq;
2353 rq = i915_request_create(ps->ce[idx]);
2354 if (IS_ERR(rq)) {
2355 err = PTR_ERR(rq);
2356 break;
2359 i915_request_get(rq);
2360 i915_request_add(rq);
2362 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2363 err = -ETIME;
2364 i915_request_put(rq);
2365 if (err)
2366 break;
2368 if (++idx == ps->nengines)
2369 idx = 0;
2370 } while (!__igt_timeout(end_time, NULL));
2372 return err;
2375 static int s_sync1(void *arg)
2377 struct perf_series *ps = arg;
2378 struct i915_request *prev = NULL;
2379 IGT_TIMEOUT(end_time);
2380 unsigned int idx = 0;
2381 int err = 0;
2383 GEM_BUG_ON(!ps->nengines);
2384 do {
2385 struct i915_request *rq;
2387 rq = i915_request_create(ps->ce[idx]);
2388 if (IS_ERR(rq)) {
2389 err = PTR_ERR(rq);
2390 break;
2393 i915_request_get(rq);
2394 i915_request_add(rq);
2396 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2397 err = -ETIME;
2398 i915_request_put(prev);
2399 prev = rq;
2400 if (err)
2401 break;
2403 if (++idx == ps->nengines)
2404 idx = 0;
2405 } while (!__igt_timeout(end_time, NULL));
2406 i915_request_put(prev);
2408 return err;
2411 static int s_many(void *arg)
2413 struct perf_series *ps = arg;
2414 IGT_TIMEOUT(end_time);
2415 unsigned int idx = 0;
2417 GEM_BUG_ON(!ps->nengines);
2418 do {
2419 struct i915_request *rq;
2421 rq = i915_request_create(ps->ce[idx]);
2422 if (IS_ERR(rq))
2423 return PTR_ERR(rq);
2425 i915_request_add(rq);
2427 if (++idx == ps->nengines)
2428 idx = 0;
2429 } while (!__igt_timeout(end_time, NULL));
2431 return 0;
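/*
 * Run each series workload (s_sync0, s_sync1, s_many) over a pinned context
 * on every uabi engine in turn, then report per-engine busyness, context
 * runtime and walltime.
 */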
2434 static int perf_series_engines(void *arg)
2436 struct drm_i915_private *i915 = arg;
2437 static int (* const func[])(void *arg) = {
2438 s_sync0,
2439 s_sync1,
2440 s_many,
2441 NULL,
2443 const unsigned int nengines = num_uabi_engines(i915);
2444 struct intel_engine_cs *engine;
2445 int (* const *fn)(void *arg);
2446 struct pm_qos_request qos;
2447 struct perf_stats *stats;
2448 struct perf_series *ps;
2449 unsigned int idx;
2450 int err = 0;
2452 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2453 if (!stats)
2454 return -ENOMEM;
2456 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2457 if (!ps) {
2458 kfree(stats);
2459 return -ENOMEM;
2462 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2464 ps->i915 = i915;
2465 ps->nengines = nengines;
2467 idx = 0;
2468 for_each_uabi_engine(engine, i915) {
2469 struct intel_context *ce;
2471 ce = intel_context_create(engine);
2472 if (IS_ERR(ce)) {
2473 err = PTR_ERR(ce);
2474 goto out;
2477 err = intel_context_pin(ce);
2478 if (err) {
2479 intel_context_put(ce);
2480 goto out;
2483 ps->ce[idx++] = ce;
2485 GEM_BUG_ON(idx != ps->nengines);
2487 for (fn = func; *fn && !err; fn++) {
2488 char name[KSYM_NAME_LEN];
2489 struct igt_live_test t;
2491 snprintf(name, sizeof(name), "%ps", *fn);
2492 err = igt_live_test_begin(&t, i915, __func__, name);
2493 if (err)
2494 break;
2496 for (idx = 0; idx < nengines; idx++) {
2497 struct perf_stats *p =
2498 memset(&stats[idx], 0, sizeof(stats[idx]));
2499 struct intel_context *ce = ps->ce[idx];
2501 p->engine = ps->ce[idx]->engine;
2502 intel_engine_pm_get(p->engine);
2504 if (intel_engine_supports_stats(p->engine))
2505 p->busy = intel_engine_get_busy_time(p->engine,
2506 &p->time) + 1;
2507 else
2508 p->time = ktime_get();
2509 p->runtime = -intel_context_get_total_runtime_ns(ce);
2512 err = (*fn)(ps);
2513 if (igt_live_test_end(&t))
2514 err = -EIO;
2516 for (idx = 0; idx < nengines; idx++) {
2517 struct perf_stats *p = &stats[idx];
2518 struct intel_context *ce = ps->ce[idx];
2519 int integer, decimal;
2520 u64 busy, dt, now;
2522 if (p->busy)
2523 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2524 &now),
2525 p->busy - 1);
2526 else
2527 now = ktime_get();
2528 p->time = ktime_sub(now, p->time);
2530 err = switch_to_kernel_sync(ce, err);
2531 p->runtime += intel_context_get_total_runtime_ns(ce);
2532 intel_engine_pm_put(p->engine);
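/*
 * Express busyness as a percentage of the elapsed walltime with two
 * decimal places using only integer arithmetic, e.g. 37.5ms busy over
 * a 100ms sample prints as "37.50%".
 */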
2534 busy = 100 * ktime_to_ns(p->busy);
2535 dt = ktime_to_ns(p->time);
2536 if (dt) {
2537 integer = div64_u64(busy, dt);
2538 busy -= integer * dt;
2539 decimal = div64_u64(100 * busy, dt);
2540 } else {
2541 integer = 0;
2542 decimal = 0;
2545 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2546 name, p->engine->name, ce->timeline->seqno,
2547 integer, decimal,
2548 div_u64(p->runtime, 1000 * 1000),
2549 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2553 out:
2554 for (idx = 0; idx < nengines; idx++) {
2555 if (IS_ERR_OR_NULL(ps->ce[idx]))
2556 break;
2558 intel_context_unpin(ps->ce[idx]);
2559 intel_context_put(ps->ce[idx]);
2561 kfree(ps);
2563 cpu_latency_qos_remove_request(&qos);
2564 kfree(stats);
2565 return err;
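/*
 * p_sync0: per-engine kthread body; submit a request and synchronously
 * wait for it before submitting the next, counting how many requests
 * complete within the timeout.
 */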
2568 static int p_sync0(void *arg)
2570 struct perf_stats *p = arg;
2571 struct intel_engine_cs *engine = p->engine;
2572 struct intel_context *ce;
2573 IGT_TIMEOUT(end_time);
2574 unsigned long count;
2575 bool busy;
2576 int err = 0;
2578 ce = intel_context_create(engine);
2579 if (IS_ERR(ce))
2580 return PTR_ERR(ce);
2582 err = intel_context_pin(ce);
2583 if (err) {
2584 intel_context_put(ce);
2585 return err;
2588 if (intel_engine_supports_stats(engine)) {
2589 p->busy = intel_engine_get_busy_time(engine, &p->time);
2590 busy = true;
2591 } else {
2592 p->time = ktime_get();
2593 busy = false;
2596 count = 0;
2597 do {
2598 struct i915_request *rq;
2600 rq = i915_request_create(ce);
2601 if (IS_ERR(rq)) {
2602 err = PTR_ERR(rq);
2603 break;
2606 i915_request_get(rq);
2607 i915_request_add(rq);
2609 err = 0;
2610 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2611 err = -ETIME;
2612 i915_request_put(rq);
2613 if (err)
2614 break;
2616 count++;
2617 } while (!__igt_timeout(end_time, NULL));
2619 if (busy) {
2620 ktime_t now;
2622 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2623 p->busy);
2624 p->time = ktime_sub(now, p->time);
2625 } else {
2626 p->time = ktime_sub(ktime_get(), p->time);
2629 err = switch_to_kernel_sync(ce, err);
2630 p->runtime = intel_context_get_total_runtime_ns(ce);
2631 p->count = count;
2633 intel_context_unpin(ce);
2634 intel_context_put(ce);
2635 return err;
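/*
 * p_sync1: as p_sync0, but wait on the previous request instead of the
 * one just submitted, so there is always one request in flight.
 */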
2638 static int p_sync1(void *arg)
2640 struct perf_stats *p = arg;
2641 struct intel_engine_cs *engine = p->engine;
2642 struct i915_request *prev = NULL;
2643 struct intel_context *ce;
2644 IGT_TIMEOUT(end_time);
2645 unsigned long count;
2646 bool busy;
2647 int err = 0;
2649 ce = intel_context_create(engine);
2650 if (IS_ERR(ce))
2651 return PTR_ERR(ce);
2653 err = intel_context_pin(ce);
2654 if (err) {
2655 intel_context_put(ce);
2656 return err;
2659 if (intel_engine_supports_stats(engine)) {
2660 p->busy = intel_engine_get_busy_time(engine, &p->time);
2661 busy = true;
2662 } else {
2663 p->time = ktime_get();
2664 busy = false;
2667 count = 0;
2668 do {
2669 struct i915_request *rq;
2671 rq = i915_request_create(ce);
2672 if (IS_ERR(rq)) {
2673 err = PTR_ERR(rq);
2674 break;
2677 i915_request_get(rq);
2678 i915_request_add(rq);
2680 err = 0;
2681 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2682 err = -ETIME;
2683 i915_request_put(prev);
2684 prev = rq;
2685 if (err)
2686 break;
2688 count++;
2689 } while (!__igt_timeout(end_time, NULL));
2690 i915_request_put(prev);
2692 if (busy) {
2693 ktime_t now;
2695 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2696 p->busy);
2697 p->time = ktime_sub(now, p->time);
2698 } else {
2699 p->time = ktime_sub(ktime_get(), p->time);
2702 err = switch_to_kernel_sync(ce, err);
2703 p->runtime = intel_context_get_total_runtime_ns(ce);
2704 p->count = count;
2706 intel_context_unpin(ce);
2707 intel_context_put(ce);
2708 return err;
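/*
 * p_many: flood the engine with requests without waiting, measuring raw
 * submission throughput per engine.
 */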
2711 static int p_many(void *arg)
2713 struct perf_stats *p = arg;
2714 struct intel_engine_cs *engine = p->engine;
2715 struct intel_context *ce;
2716 IGT_TIMEOUT(end_time);
2717 unsigned long count;
2718 int err = 0;
2719 bool busy;
2721 ce = intel_context_create(engine);
2722 if (IS_ERR(ce))
2723 return PTR_ERR(ce);
2725 err = intel_context_pin(ce);
2726 if (err) {
2727 intel_context_put(ce);
2728 return err;
2731 if (intel_engine_supports_stats(engine)) {
2732 p->busy = intel_engine_get_busy_time(engine, &p->time);
2733 busy = true;
2734 } else {
2735 p->time = ktime_get();
2736 busy = false;
2739 count = 0;
2740 do {
2741 struct i915_request *rq;
2743 rq = i915_request_create(ce);
2744 if (IS_ERR(rq)) {
2745 err = PTR_ERR(rq);
2746 break;
2749 i915_request_add(rq);
2750 count++;
2751 } while (!__igt_timeout(end_time, NULL));
2753 if (busy) {
2754 ktime_t now;
2756 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2757 p->busy);
2758 p->time = ktime_sub(now, p->time);
2759 } else {
2760 p->time = ktime_sub(ktime_get(), p->time);
2763 err = switch_to_kernel_sync(ce, err);
2764 p->runtime = intel_context_get_total_runtime_ns(ce);
2765 p->count = count;
2767 intel_context_unpin(ce);
2768 intel_context_put(ce);
2769 return err;
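/*
 * Spawn one kthread per uabi engine, each running the same workload
 * concurrently, then gather and report the per-engine request count,
 * busyness, runtime and walltime.
 */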
2772 static int perf_parallel_engines(void *arg)
2774 struct drm_i915_private *i915 = arg;
2775 static int (* const func[])(void *arg) = {
2776 p_sync0,
2777 p_sync1,
2778 p_many,
2779 NULL,
2781 const unsigned int nengines = num_uabi_engines(i915);
2782 struct intel_engine_cs *engine;
2783 int (* const *fn)(void *arg);
2784 struct pm_qos_request qos;
2785 struct {
2786 struct perf_stats p;
2787 struct task_struct *tsk;
2788 } *engines;
2789 int err = 0;
2791 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2792 if (!engines)
2793 return -ENOMEM;
2795 cpu_latency_qos_add_request(&qos, 0);
2797 for (fn = func; *fn; fn++) {
2798 char name[KSYM_NAME_LEN];
2799 struct igt_live_test t;
2800 unsigned int idx;
2802 snprintf(name, sizeof(name), "%ps", *fn);
2803 err = igt_live_test_begin(&t, i915, __func__, name);
2804 if (err)
2805 break;
2807 atomic_set(&i915->selftest.counter, nengines);
2809 idx = 0;
2810 for_each_uabi_engine(engine, i915) {
2811 intel_engine_pm_get(engine);
2813 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2814 engines[idx].p.engine = engine;
2816 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2817 "igt:%s", engine->name);
2818 if (IS_ERR(engines[idx].tsk)) {
2819 err = PTR_ERR(engines[idx].tsk);
2820 intel_engine_pm_put(engine);
2821 break;
2823 get_task_struct(engines[idx++].tsk);
2826 yield(); /* start all threads before we kthread_stop() */
2828 idx = 0;
2829 for_each_uabi_engine(engine, i915) {
2830 int status;
2832 if (IS_ERR(engines[idx].tsk))
2833 break;
2835 status = kthread_stop(engines[idx].tsk);
2836 if (status && !err)
2837 err = status;
2839 intel_engine_pm_put(engine);
2840 put_task_struct(engines[idx++].tsk);
2843 if (igt_live_test_end(&t))
2844 err = -EIO;
2845 if (err)
2846 break;
2848 idx = 0;
2849 for_each_uabi_engine(engine, i915) {
2850 struct perf_stats *p = &engines[idx].p;
2851 u64 busy = 100 * ktime_to_ns(p->busy);
2852 u64 dt = ktime_to_ns(p->time);
2853 int integer, decimal;
2855 if (dt) {
2856 integer = div64_u64(busy, dt);
2857 busy -= integer * dt;
2858 decimal = div64_u64(100 * busy, dt);
2859 } else {
2860 integer = 0;
2861 decimal = 0;
2864 GEM_BUG_ON(engine != p->engine);
2865 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2866 name, engine->name, p->count, integer, decimal,
2867 div_u64(p->runtime, 1000 * 1000),
2868 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2869 idx++;
2873 cpu_latency_qos_remove_request(&qos);
2874 kfree(engines);
2875 return err;
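/*
 * Entry point for the request perf selftests; skipped if the GT is
 * already wedged.
 */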
2878 int i915_request_perf_selftests(struct drm_i915_private *i915)
2880 static const struct i915_subtest tests[] = {
2881 SUBTEST(perf_request_latency),
2882 SUBTEST(perf_series_engines),
2883 SUBTEST(perf_parallel_engines),
2886 if (intel_gt_is_wedged(&i915->gt))
2887 return 0;
2889 return i915_subtests(tests, i915);