drivers/gpu/drm/i915/selftests/intel_hangcheck.c

/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};
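
/*
 * Set up the shared state for the hang tests: a CPU-mapped HWS page for
 * the batches to report their seqnos into (initialised to 0xff), and a
 * page-sized internal object that will hold the hanging batch itself.
 */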
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}
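
/* Address of this request's seqno slot within the HWS page. */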
static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
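
/*
 * Build a per-gen batch that writes the request's seqno into its HWS slot
 * and then branches back to its own start, spinning forever until the
 * buffer is rewritten with MI_BATCH_BUFFER_END (or the engine is reset).
 */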
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}
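
/*
 * Allocate a request carrying the hanging batch. If the previous batch
 * object is still active on the GPU, swap in a freshly mapped one so we
 * never rewrite a batch that may still be executing.
 */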
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}
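
/* Seqno last written by the batch for this request's context. */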
static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}
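
/*
 * Terminate any still-spinning batch by overwriting it with
 * MI_BATCH_BUFFER_END, release the objects and wait for the GPU to idle.
 */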
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}
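
/*
 * Wait (briefly, then for up to a second) until the batch reports its
 * seqno, i.e. the hanging request has actually started on the GPU.
 */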
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
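
/*
 * Take the global reset backoff bit and every per-engine reset bit,
 * waiting out any reset already in flight, so that no other reset can
 * run concurrently with the test.
 */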
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}
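
/*
 * kthread body used to keep an engine busy in the background: it
 * ping-pongs requests between two contexts until told to stop.
 */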
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct drm_i915_gem_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine,
							 i915->kernel_context);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_gem_request_get(rq);
				__i915_add_request(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_gem_request_put(rq);
					err = -EIO;
					break;
				}

				i915_gem_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		cond_resched();
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}
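
/*
 * Pretend hangcheck has declared the engine hung: mark it stalled at the
 * current seqno and signal the reset handoff so that the waiter performs
 * the reset. Returns the global reset count sampled beforehand.
 */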
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_reset(i915, 0);
				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}
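
/*
 * Entry point for the live hangcheck/reset selftests. Requires GPU reset
 * support; runs with a runtime-pm wakeref held and with the hangcheck
 * modparam temporarily disabled so the tests control resets themselves.
 */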
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}