/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2017-2018 Intel Corporation
 */

#include <linux/prime_numbers.h>

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"
#include "selftest_engine_heartbeat.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "selftests/igt_flush_test.h"
#include "selftests/lib_sw_fence.h"
#include "selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"

static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}

static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + tl->hwsp_offset) / CACHELINE_BYTES;
}

#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)
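
/*
 * With the usual 4KiB pages and 64-byte cachelines, CACHELINES_PER_PAGE works
 * out to 64: each page of HWSP backing store can hand out up to 64 distinct
 * seqno slots, one per timeline.
 */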

struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};
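
/*
 * The freelist tracker: "cachelines" records which HWSP cacheline each live
 * timeline landed in so that a duplicate allocation is caught immediately,
 * while "history" keeps a bounded window of the most recent timelines so a
 * random subset can be released (and their slots recycled) between rounds.
 */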

static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_put(tl);
	}
}

static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}
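
/*
 * Each phase below allocates a prime number of timelines (up to two pages'
 * worth of cachelines) before releasing a random subset, so the allocation
 * and free patterns deliberately avoid lining up with the per-page slot
 * packing and the freelist is exercised across page boundaries.
 */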

static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = &i915->gt;

	/*
	 * Create a bunch of timelines and check that their HWSP do not overlap.
	 * Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	mock_destroy_device(i915);
	return err;
}

struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}
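
/*
 * The pass[] table below walks a single context through its seqno history:
 * "expected" is what __intel_timeline_sync_is_later() should report before
 * the optional set. Seqnos are u32 and compare with wrap-around semantics
 * (a value counts as already passed while it is no more than INT_MAX behind
 * the most recent one recorded), which is why INT_MAX reads as passed once
 * (u32)INT_MAX+1 has been set, and 0 reads as passed again after UINT_MAX.
 */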

static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}

static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}

static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself becomes a significant factor in the per-iteration
	 * timings. We try to compensate the results by measuring the overhead
	 * of the prng and subtract it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);
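
	/*
	 * prng32_1M is the measured cost of the prng in nanoseconds, scaled by
	 * 2^20 (roughly "ns per million calls"). The random benchmarks below
	 * consume two prng values per iteration, so subtracting
	 * (count * prng32_1M * 2) >> 20 removes approximately the prng
	 * overhead from their reported per-op timings.
	 */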

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();
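
	/*
	 * The ({ ... }) step expression below advances "order" along a
	 * Fibonacci-like sequence (1, 2, 3, 5, 8, ...), so the cyclic id
	 * pattern is sampled at a spread of strides without assuming where
	 * the underlying sync map changes phase.
	 */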

	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes.
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}

int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}

static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (INTEL_GEN(rq->engine->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (INTEL_GEN(rq->engine->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}
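
/*
 * The dword layouts above follow the usual MI_STORE_DWORD_IMM conventions:
 * gen8+ takes a 64-bit GGTT address (low dword then an upper dword of 0)
 * before the value, gen4-7 expect a zero dword followed by the 32-bit
 * address, and older parts use the MI_MEM_VIRTUAL form with a trailing
 * MI_NOOP to pad the packet out to the four dwords reserved above.
 */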

static struct i915_request *
tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = intel_timeline_pin(tl, NULL);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}

static struct intel_timeline *
checked_intel_timeline_create(struct intel_gt *gt)
{
	struct intel_timeline *tl;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return tl;

	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_put(tl);
		return ERR_PTR(-EINVAL);
	}

	return tl;
}

static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
		goto out_free;

	err = intel_timeline_pin(tl, NULL);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);
		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}

static int emit_read_hwsp(struct i915_request *rq,
			  u32 seqno, u32 hwsp,
			  u32 *addr)
{
	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = *addr;
	*cs++ = 0;
	*cs++ = seqno;
	*addr += 4;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = hwsp;
	*cs++ = 0;

	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = *addr;
	*cs++ = 0;
	*addr += 4;

	intel_ring_advance(rq, cs);

	return 0;
}
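
/*
 * Each call appends a (seqno, live HWSP value) pair to the watcher buffer:
 * first the request's own seqno is written at *addr, then the current HWSP
 * contents are copied alongside it via a CS GPR, and *addr is advanced so
 * check_watcher() can later walk the pairs and compare them.
 */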

struct hwsp_watcher {
	struct i915_vma *vma;
	struct i915_request *rq;
	u32 addr;
	u32 *map;
};

static bool cmp_lt(u32 a, u32 b)
{
	return a < b;
}

static bool cmp_gte(u32 a, u32 b)
{
	return a >= b;
}

static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;

	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	w->map = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(w->map)) {
		i915_gem_object_put(obj);
		return PTR_ERR(w->map);
	}

	vma = i915_gem_object_ggtt_pin_ww(obj, NULL, NULL, 0, 0, 0);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return PTR_ERR(vma);
	}

	w->vma = vma;
	w->addr = i915_ggtt_offset(vma);
	return 0;
}

static int create_watcher(struct hwsp_watcher *w,
			  struct intel_engine_cs *engine,
			  int ringsz)
{
	struct intel_context *ce;
	struct intel_timeline *tl;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	ce->ring = __intel_context_ring_size(ringsz);
	w->rq = intel_context_create_request(ce);
	intel_context_put(ce);
	if (IS_ERR(w->rq))
		return PTR_ERR(w->rq);

	w->addr = i915_ggtt_offset(w->vma);
	tl = w->rq->context->timeline;

	/* some light mutex juggling required; think co-routines */
	lockdep_unpin_lock(&tl->mutex, w->rq->cookie);
	mutex_unlock(&tl->mutex);

	return 0;
}
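
/*
 * The watcher request created above is left unsubmitted on purpose: its
 * timeline mutex is unlocked (and the lockdep pin dropped) so the test can
 * keep emitting SRM packets into it from other contexts; it is only
 * re-locked, re-pinned and finally submitted in check_watcher().
 */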

static int check_watcher(struct hwsp_watcher *w, const char *name,
			 bool (*op)(u32 hwsp, u32 seqno))
{
	struct i915_request *rq = fetch_and_zero(&w->rq);
	struct intel_timeline *tl = rq->context->timeline;
	u32 offset, end;
	int err;

	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);

	i915_request_get(rq);
	mutex_lock(&tl->mutex);
	rq->cookie = lockdep_pin_lock(&tl->mutex);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		err = -ETIME;
		goto out;
	}

	err = 0;
	offset = 0;
	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
	while (offset < end) {
		if (!op(w->map[offset + 1], w->map[offset])) {
			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
			       name, w->map[offset + 1], w->map[offset]);
			err = -EINVAL;
		}

		offset += 2;
	}

out:
	i915_request_put(rq);
	return err;
}

static void cleanup_watcher(struct hwsp_watcher *w)
{
	if (w->rq) {
		struct intel_timeline *tl = w->rq->context->timeline;

		mutex_lock(&tl->mutex);
		w->rq->cookie = lockdep_pin_lock(&tl->mutex);

		i915_request_add(w->rq);
	}

	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
}

static bool retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	mutex_lock(&tl->mutex);
	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
	mutex_unlock(&tl->mutex);

	return !i915_active_fence_isset(&tl->last_request);
}

static struct i915_request *wrap_timeline(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	struct intel_timeline *tl = ce->timeline;
	u32 seqno = rq->fence.seqno;

	while (tl->seqno >= seqno) { /* Cause a wrap */
		i915_request_put(rq);
		rq = intel_context_create_request(ce);
		if (IS_ERR(rq))
			return rq;

		i915_request_get(rq);
		i915_request_add(rq);
	}

	i915_request_put(rq);
	rq = intel_context_create_request(ce);
	if (IS_ERR(rq))
		return rq;

	i915_request_get(rq);
	i915_request_add(rq);

	return rq;
}
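
/*
 * wrap_timeline() keeps emitting requests on the same context until the
 * timeline's seqno wraps past the original request's value, then hands back
 * a reference to the first request issued after the wrap, consuming the
 * reference that was passed in.
 */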

static int live_hwsp_read(void *arg)
{
	struct intel_gt *gt = arg;
	struct hwsp_watcher watcher[2] = {};
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;
	int i;

	/*
	 * If we take a reference to the HWSP for reading on the GPU, that
	 * read may be arbitrarily delayed (either by foreign fence or
	 * priority saturation) and a wrap can happen within 30 minutes.
	 * When the GPU read is finally submitted it should be correct,
	 * even across multiple wraps.
	 */

	if (INTEL_GEN(gt->i915) < 8) /* CS convenience [SRM/LRM] */
		return 0;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->hwsp_cacheline)
		goto out_free;

	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
		err = setup_watcher(&watcher[i], gt);
		if (err)
			goto out;
	}

	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		/* Create a request we can use for remote reading of the HWSP */
		err = create_watcher(&watcher[1], engine, SZ_512K);
		if (err)
			goto out;

		do {
			struct i915_sw_fence *submit;
			struct i915_request *rq;
			u32 hwsp;

			submit = heap_fence_create(GFP_KERNEL);
			if (!submit) {
				err = -ENOMEM;
				goto out;
			}

			err = create_watcher(&watcher[0], engine, SZ_4K);
			if (err)
				goto out;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				goto out;
			}

			/* Skip to the end, saving 30 minutes of nops */
			tl->seqno = -10u + 2 * (count & 3);
			WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
			ce->timeline = intel_timeline_get(tl);

			rq = intel_context_create_request(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				intel_context_put(ce);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&rq->submit,
							    &watcher[0].rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[0].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[0].rq, /* before */
						     rq->fence.seqno, hwsp,
						     &watcher[0].addr);
			mutex_unlock(&watcher[0].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[1].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[1].rq, /* after */
						     rq->fence.seqno, hwsp,
						     &watcher[1].addr);
			mutex_unlock(&watcher[1].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			rq = wrap_timeline(rq);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
							    &rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				goto out;
			}

			err = check_watcher(&watcher[0], "before", cmp_lt);
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			if (err) {
				i915_request_put(rq);
				goto out;
			}
			count++;

			/* Stop before watcher[1]'s ring fills up with SRM packets */
			if (8 * watcher[1].rq->ring->emit >
			    3 * watcher[1].rq->ring->size) {
				i915_request_put(rq);
				break;
			}

			/* Flush the timeline before manually wrapping again */
			if (i915_request_wait(rq,
					      I915_WAIT_INTERRUPTIBLE,
					      HZ) < 0) {
				err = -ETIME;
				i915_request_put(rq);
				goto out;
			}

			retire_requests(tl);
			i915_request_put(rq);
		} while (!__igt_timeout(end_time, NULL));
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, 0xdeadbeef);

		pr_info("%s: simulated %lu wraps\n", engine->name, count);
		err = check_watcher(&watcher[1], "after", cmp_gte);
		if (err)
			goto out;
	}

out:
	for (i = 0; i < ARRAY_SIZE(watcher); i++)
		cleanup_watcher(&watcher[i]);

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_free:
	intel_timeline_put(tl);
	return err;
}

static int live_hwsp_rollover_kernel(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Run the host for long enough, and even the kernel context will
	 * see a seqno rollover.
	 */

	for_each_engine(engine, gt, id) {
		struct intel_context *ce = engine->kernel_context;
		struct intel_timeline *tl = ce->timeline;
		struct i915_request *rq[3] = {};
		int i;

		st_engine_heartbeat_disable(engine);
		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
		tl->seqno = 0;
		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = i915_request_create(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		st_engine_heartbeat_enable(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_rollover_user(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Simulate a long running user context, and force the seqno wrap
	 * on the user's timeline.
	 */

	for_each_engine(engine, gt, id) {
		struct i915_request *rq[3] = {};
		struct intel_timeline *tl;
		struct intel_context *ce;
		int i;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		err = intel_context_alloc_state(ce);
		if (err)
			goto out;

		tl = ce->timeline;
		if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
			goto out;

		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = intel_context_create_request(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and neither
	 * want to confuse ourselves or the GPU.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (READ_ONCE(*tl->hwsp_seqno) != count) {
				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
					      count, tl->fence_context,
					      tl->hwsp_offset, *tl->hwsp_seqno);
				GEM_TRACE_DUMP();
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}

int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
		SUBTEST(live_hwsp_read),
		SUBTEST(live_hwsp_rollover_kernel),
		SUBTEST(live_hwsp_rollover_user),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return intel_gt_live_subtests(tests, &i915->gt);
}