drivers/gpu/drm/i915/gt/intel_lrc.c (linux/fpc-iii.git, blob 7614a3d24fca5e65783dcf6ebf1126c175a8c2fa)
1 /*
2 * Copyright © 2014 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
31 /**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
39 * One of the main differences from the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
48 * But what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need one set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time but is instead kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted, but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
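/*
 * A minimal, illustrative sketch (not driver code) of the ELSP pairing rule
 * described above: requests at the head of the queue that belong to the same
 * context are folded into one submission, so the two ports written to the
 * ELSP never carry the same context ID (the second port may stay empty).
 * The struct and function names here are hypothetical stand-ins.
 */
#include <stddef.h>

struct example_queued_req {
        unsigned int ctx_id;    /* stand-in for the per-context submission ID */
        unsigned int tail;      /* ring tail recorded when the request was written */
};

/* Returns how many ports were filled: 0, 1 or 2. */
static int example_pick_elsp_pair(const struct example_queued_req *q, size_t count,
                                  const struct example_queued_req *ports[2])
{
        size_t i = 0;

        ports[0] = ports[1] = NULL;
        if (!count)
                return 0;

        /* Discard leading same-context requests, keeping only the newest tail. */
        while (i + 1 < count && q[i + 1].ctx_id == q[i].ctx_id)
                i++;
        ports[0] = &q[i];

        /* Fill the second port only with a different context; else leave it NULL. */
        if (i + 1 < count) {
                ports[1] = &q[i + 1];
                return 2;
        }

        return 1;
}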
134 #include <linux/interrupt.h>
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
153 #define RING_EXECLIST_QFULL (1 << 0x2)
154 #define RING_EXECLIST1_VALID (1 << 0x3)
155 #define RING_EXECLIST0_VALID (1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID 0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
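/*
 * Illustrative sketch (not driver code): what the GEN12_CSB_CTX_VALID() test
 * above boils down to without the kernel's GENMASK()/FIELD_GET() helpers.
 * The SW context ID occupies bits 25:15 of the CSB dword, and the all-ones
 * value 0x7FF (GEN12_IDLE_CTX_ID) marks an idle, i.e. invalid, context.
 */
#include <stdbool.h>
#include <stdint.h>

static inline unsigned int example_csb_sw_ctx_id(uint32_t csb_dw)
{
        return (csb_dw >> 15) & 0x7FF;                  /* extract bits 25:15 */
}

static inline bool example_csb_ctx_valid(uint32_t csb_dw)
{
        return example_csb_sw_ctx_id(csb_dw) != 0x7FF;  /* != GEN12_IDLE_CTX_ID */
}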
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
182 struct virtual_engine {
183 struct intel_engine_cs base;
184 struct intel_context context;
185 struct rcu_work rcu;
188 * We allow only a single request through the virtual engine at a time
189 * (each request in the timeline waits for the completion fence of
190 * the previous before being submitted). By restricting ourselves to
191 * only submitting a single request, each request is placed on to a
192 * physical engine to maximise load spreading (by virtue of the late greedy
193 * scheduling -- each real engine takes the next available request
194 * upon idling).
196 struct i915_request *request;
199 * We keep a rbtree of available virtual engines inside each physical
200 * engine, sorted by priority. Here we preallocate the nodes we need
201 * for the virtual engine, indexed by physical_engine->id.
203 struct ve_node {
204 struct rb_node rb;
205 int prio;
206 } nodes[I915_NUM_ENGINES];
209 * Keep track of bonded pairs -- restrictions upon our selection
210 * of physical engines any particular request may be submitted to.
211 * If we receive a submit-fence from a master engine, we will only
212 * use one of sibling_mask physical engines.
214 struct ve_bond {
215 const struct intel_engine_cs *master;
216 intel_engine_mask_t sibling_mask;
217 } *bonds;
218 unsigned int num_bonds;
220 /* And finally, which physical engines this virtual engine maps onto. */
221 unsigned int num_siblings;
222 struct intel_engine_cs *siblings[];
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
227 GEM_BUG_ON(!intel_engine_is_virtual(engine));
228 return container_of(engine, struct virtual_engine, base);
231 static int __execlists_context_alloc(struct intel_context *ce,
232 struct intel_engine_cs *engine);
234 static void execlists_init_reg_state(u32 *reg_state,
235 const struct intel_context *ce,
236 const struct intel_engine_cs *engine,
237 const struct intel_ring *ring,
238 bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241 const struct intel_engine_cs *engine,
242 u32 head);
244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
246 if (INTEL_GEN(engine->i915) >= 12)
247 return 0x60;
248 else if (INTEL_GEN(engine->i915) >= 9)
249 return 0x54;
250 else if (engine->class == RENDER_CLASS)
251 return 0x58;
252 else
253 return -1;
256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
258 if (INTEL_GEN(engine->i915) >= 12)
259 return 0x74;
260 else if (INTEL_GEN(engine->i915) >= 9)
261 return 0x68;
262 else if (engine->class == RENDER_CLASS)
263 return 0xd8;
264 else
265 return -1;
268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
270 if (INTEL_GEN(engine->i915) >= 12)
271 return 0x12;
272 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
273 return 0x18;
274 else
275 return -1;
278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
280 int x;
282 x = lrc_ring_wa_bb_per_ctx(engine);
283 if (x < 0)
284 return x;
286 return x + 2;
289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
291 int x;
293 x = lrc_ring_indirect_ptr(engine);
294 if (x < 0)
295 return x;
297 return x + 2;
300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
302 if (engine->class != RENDER_CLASS)
303 return -1;
305 if (INTEL_GEN(engine->i915) >= 12)
306 return 0xb6;
307 else if (INTEL_GEN(engine->i915) >= 11)
308 return 0xaa;
309 else
310 return -1;
313 static u32
314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
316 switch (INTEL_GEN(engine->i915)) {
317 default:
318 MISSING_CASE(INTEL_GEN(engine->i915));
319 fallthrough;
320 case 12:
321 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 case 11:
323 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 case 10:
325 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 case 9:
327 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 case 8:
329 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
333 static void
334 lrc_ring_setup_indirect_ctx(u32 *regs,
335 const struct intel_engine_cs *engine,
336 u32 ctx_bb_ggtt_addr,
337 u32 size)
339 GEM_BUG_ON(!size);
340 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
341 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
342 regs[lrc_ring_indirect_ptr(engine) + 1] =
343 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
345 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
346 regs[lrc_ring_indirect_offset(engine) + 1] =
347 lrc_ring_indirect_offset_default(engine) << 6;
350 static u32 intel_context_get_runtime(const struct intel_context *ce)
353 * We can use either ppHWSP[16] which is recorded before the context
354 * switch (and so excludes the cost of context switches) or use the
355 * value from the context image itself, which is saved/restored earlier
356 * and so includes the cost of the save.
358 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
361 static void mark_eio(struct i915_request *rq)
363 if (i915_request_completed(rq))
364 return;
366 GEM_BUG_ON(i915_request_signaled(rq));
368 i915_request_set_error_once(rq, -EIO);
369 i915_request_mark_complete(rq);
372 static struct i915_request *
373 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
375 struct i915_request *active = rq;
377 rcu_read_lock();
378 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
379 if (i915_request_completed(rq))
380 break;
382 active = rq;
384 rcu_read_unlock();
386 return active;
389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
391 return (i915_ggtt_offset(engine->status_page.vma) +
392 I915_GEM_HWS_PREEMPT_ADDR);
395 static inline void
396 ring_set_paused(const struct intel_engine_cs *engine, int state)
399 * We inspect HWS_PREEMPT with a semaphore inside
400 * engine->emit_fini_breadcrumb. If the dword is true,
401 * the ring is paused as the semaphore will busywait
402 * until the dword is false.
404 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
405 if (state)
406 wmb();
409 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
411 return rb_entry(rb, struct i915_priolist, node);
414 static inline int rq_prio(const struct i915_request *rq)
416 return READ_ONCE(rq->sched.attr.priority);
419 static int effective_prio(const struct i915_request *rq)
421 int prio = rq_prio(rq);
424 * If this request is special and must not be interrupted at any
425 * cost, so be it. Note we are only checking the most recent request
426 * in the context and so may be masking an earlier vip request. It
427 * is hoped that under the conditions where nopreempt is used, this
428 * will not matter (i.e. all requests to that context will be
429 * nopreempt for as long as desired).
431 if (i915_request_has_nopreempt(rq))
432 prio = I915_PRIORITY_UNPREEMPTABLE;
434 return prio;
437 static int queue_prio(const struct intel_engine_execlists *execlists)
439 struct i915_priolist *p;
440 struct rb_node *rb;
442 rb = rb_first_cached(&execlists->queue);
443 if (!rb)
444 return INT_MIN;
447 * As the priolist[] are inverted, with the highest priority in [0],
448 * we have to flip the index value to recover the priority.
450 p = to_priolist(rb);
451 if (!I915_USER_PRIORITY_SHIFT)
452 return p->priority;
454 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
457 static inline bool need_preempt(const struct intel_engine_cs *engine,
458 const struct i915_request *rq,
459 struct rb_node *rb)
461 int last_prio;
463 if (!intel_engine_has_semaphores(engine))
464 return false;
467 * Check if the current priority hint merits a preemption attempt.
469 * We record the highest value priority we saw during rescheduling
470 * prior to this dequeue, therefore we know that if it is strictly
471 * less than the current tail of ELSP[0], we do not need to force
472 * a preempt-to-idle cycle.
474 * However, the priority hint is a mere hint that we may need to
475 * preempt. If that hint is stale or we may be trying to preempt
476 * ourselves, ignore the request.
478 * More naturally we would write
479 * prio >= max(0, last);
480 * except that we wish to prevent triggering preemption at the same
481 * priority level: the task that is running should remain running
482 * to preserve FIFO ordering of dependencies.
484 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
485 if (engine->execlists.queue_priority_hint <= last_prio)
486 return false;
489 * Check against the first request in ELSP[1], it will, thanks to the
490 * power of PI, be the highest priority of that context.
492 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
493 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
494 return true;
496 if (rb) {
497 struct virtual_engine *ve =
498 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
499 bool preempt = false;
501 if (engine == ve->siblings[0]) { /* only preempt one sibling */
502 struct i915_request *next;
504 rcu_read_lock();
505 next = READ_ONCE(ve->request);
506 if (next)
507 preempt = rq_prio(next) > last_prio;
508 rcu_read_unlock();
511 if (preempt)
512 return preempt;
516 * If the inflight context did not trigger the preemption, then maybe
517 * it was the set of queued requests? Pick the highest priority in
518 * the queue (the first active priolist) and see if it deserves to be
519 * running instead of ELSP[0].
521 * The highest priority request in the queue cannot be either
522 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
523 * context, its priority would not exceed ELSP[0] aka last_prio.
525 return queue_prio(&engine->execlists) > last_prio;
528 __maybe_unused static inline bool
529 assert_priority_queue(const struct i915_request *prev,
530 const struct i915_request *next)
533 * Without preemption, the prev may refer to the still active element
534 * which we refuse to let go.
536 * Even with preemption, there are times when we think it is better not
537 * to preempt and leave an ostensibly lower priority request in flight.
539 if (i915_request_is_active(prev))
540 return true;
542 return rq_prio(prev) >= rq_prio(next);
546 * The context descriptor encodes various attributes of a context,
547 * including its GTT address and some flags. Because it's fairly
548 * expensive to calculate, we'll just do it once and cache the result,
549 * which remains valid until the context is unpinned.
551 * This is what a descriptor looks like, from LSB to MSB::
553 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
554 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
555 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
556 * bits 53-54: mbz, reserved for use by hardware
557 * bits 55-63: group ID, currently unused and set to 0
559 * Starting from Gen11, the upper dword of the descriptor has a new format:
561 * bits 32-36: reserved
562 * bits 37-47: SW context ID
563 * bits 48-53: engine instance
564 * bit 54: mbz, reserved for use by hardware
565 * bits 55-60: SW counter
566 * bits 61-63: engine class
568 * engine info, SW context ID and SW counter need to form a unique number
569 * (Context ID) per lrc.
571 static u32
572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
574 u32 desc;
576 desc = INTEL_LEGACY_32B_CONTEXT;
577 if (i915_vm_is_4lvl(ce->vm))
578 desc = INTEL_LEGACY_64B_CONTEXT;
579 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
581 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
582 if (IS_GEN(engine->i915, 8))
583 desc |= GEN8_CTX_L3LLC_COHERENT;
585 return i915_ggtt_offset(ce->state) | desc;
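/*
 * Illustrative sketch (not driver code): packing the Gen11+ upper descriptor
 * dword according to the layout documented above -- SW context ID in bits
 * 37-47, engine instance in bits 48-53, SW counter in bits 55-60 and engine
 * class in bits 61-63. The driver assembles these fields elsewhere; this
 * hypothetical helper only shows how the documented fields compose.
 */
#include <stdint.h>

static inline uint64_t example_gen11_ctx_id(uint64_t sw_ctx_id,
                                            uint64_t engine_instance,
                                            uint64_t sw_counter,
                                            uint64_t engine_class)
{
        return (sw_ctx_id       & 0x7ff) << 37 |        /* bits 37-47 */
               (engine_instance &  0x3f) << 48 |        /* bits 48-53 */
               (sw_counter      &  0x3f) << 55 |        /* bits 55-60 */
               (engine_class    &   0x7) << 61;         /* bits 61-63 */
}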
588 static inline unsigned int dword_in_page(void *addr)
590 return offset_in_page(addr) / sizeof(u32);
593 static void set_offsets(u32 *regs,
594 const u8 *data,
595 const struct intel_engine_cs *engine,
596 bool clear)
597 #define NOP(x) (BIT(7) | (x))
598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
599 #define POSTED BIT(0)
600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
601 #define REG16(x) \
602 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
603 (((x) >> 2) & 0x7f)
604 #define END(total_state_size) 0, (total_state_size)
606 const u32 base = engine->mmio_base;
608 while (*data) {
609 u8 count, flags;
611 if (*data & BIT(7)) { /* skip */
612 count = *data++ & ~BIT(7);
613 if (clear)
614 memset32(regs, MI_NOOP, count);
615 regs += count;
616 continue;
619 count = *data & 0x3f;
620 flags = *data >> 6;
621 data++;
623 *regs = MI_LOAD_REGISTER_IMM(count);
624 if (flags & POSTED)
625 *regs |= MI_LRI_FORCE_POSTED;
626 if (INTEL_GEN(engine->i915) >= 11)
627 *regs |= MI_LRI_LRM_CS_MMIO;
628 regs++;
630 GEM_BUG_ON(!count);
631 do {
632 u32 offset = 0;
633 u8 v;
635 do {
636 v = *data++;
637 offset <<= 7;
638 offset |= v & ~BIT(7);
639 } while (v & BIT(7));
641 regs[0] = base + (offset << 2);
642 if (clear)
643 regs[1] = 0;
644 regs += 2;
645 } while (--count);
648 if (clear) {
649 u8 count = *++data;
651 /* Clear past the tail for HW access */
652 GEM_BUG_ON(dword_in_page(regs) > count);
653 memset32(regs, MI_NOOP, count - dword_in_page(regs));
655 /* Close the batch; used mainly by live_lrc_layout() */
656 *regs = MI_BATCH_BUFFER_END;
657 if (INTEL_GEN(engine->i915) >= 10)
658 *regs |= BIT(0);
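/*
 * Illustrative sketch (not driver code) of the register-offset encoding that
 * set_offsets() above consumes: each offset is stored most-significant group
 * first, 7 bits per byte, with bit 7 flagging that another byte follows; the
 * decoded value is a dword offset from the engine's mmio base. For example,
 * REG16(0x244) encodes as the bytes 0x81 0x11, which decode to dword offset
 * 0x91, i.e. byte offset 0x244.
 */
#include <stdint.h>

static const uint8_t *example_decode_reg_offset(const uint8_t *data,
                                                uint32_t mmio_base,
                                                uint32_t *reg)
{
        uint32_t offset = 0;
        uint8_t v;

        do {
                v = *data++;
                offset <<= 7;
                offset |= v & 0x7f;     /* low 7 bits carry the payload */
        } while (v & 0x80);             /* bit 7 set: more bytes follow */

        *reg = mmio_base + (offset << 2);       /* dword offset -> byte address */
        return data;                            /* points past the encoded offset */
}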
662 static const u8 gen8_xcs_offsets[] = {
663 NOP(1),
664 LRI(11, 0),
665 REG16(0x244),
666 REG(0x034),
667 REG(0x030),
668 REG(0x038),
669 REG(0x03c),
670 REG(0x168),
671 REG(0x140),
672 REG(0x110),
673 REG(0x11c),
674 REG(0x114),
675 REG(0x118),
677 NOP(9),
678 LRI(9, 0),
679 REG16(0x3a8),
680 REG16(0x28c),
681 REG16(0x288),
682 REG16(0x284),
683 REG16(0x280),
684 REG16(0x27c),
685 REG16(0x278),
686 REG16(0x274),
687 REG16(0x270),
689 NOP(13),
690 LRI(2, 0),
691 REG16(0x200),
692 REG(0x028),
694 END(80)
697 static const u8 gen9_xcs_offsets[] = {
698 NOP(1),
699 LRI(14, POSTED),
700 REG16(0x244),
701 REG(0x034),
702 REG(0x030),
703 REG(0x038),
704 REG(0x03c),
705 REG(0x168),
706 REG(0x140),
707 REG(0x110),
708 REG(0x11c),
709 REG(0x114),
710 REG(0x118),
711 REG(0x1c0),
712 REG(0x1c4),
713 REG(0x1c8),
715 NOP(3),
716 LRI(9, POSTED),
717 REG16(0x3a8),
718 REG16(0x28c),
719 REG16(0x288),
720 REG16(0x284),
721 REG16(0x280),
722 REG16(0x27c),
723 REG16(0x278),
724 REG16(0x274),
725 REG16(0x270),
727 NOP(13),
728 LRI(1, POSTED),
729 REG16(0x200),
731 NOP(13),
732 LRI(44, POSTED),
733 REG(0x028),
734 REG(0x09c),
735 REG(0x0c0),
736 REG(0x178),
737 REG(0x17c),
738 REG16(0x358),
739 REG(0x170),
740 REG(0x150),
741 REG(0x154),
742 REG(0x158),
743 REG16(0x41c),
744 REG16(0x600),
745 REG16(0x604),
746 REG16(0x608),
747 REG16(0x60c),
748 REG16(0x610),
749 REG16(0x614),
750 REG16(0x618),
751 REG16(0x61c),
752 REG16(0x620),
753 REG16(0x624),
754 REG16(0x628),
755 REG16(0x62c),
756 REG16(0x630),
757 REG16(0x634),
758 REG16(0x638),
759 REG16(0x63c),
760 REG16(0x640),
761 REG16(0x644),
762 REG16(0x648),
763 REG16(0x64c),
764 REG16(0x650),
765 REG16(0x654),
766 REG16(0x658),
767 REG16(0x65c),
768 REG16(0x660),
769 REG16(0x664),
770 REG16(0x668),
771 REG16(0x66c),
772 REG16(0x670),
773 REG16(0x674),
774 REG16(0x678),
775 REG16(0x67c),
776 REG(0x068),
778 END(176)
781 static const u8 gen12_xcs_offsets[] = {
782 NOP(1),
783 LRI(13, POSTED),
784 REG16(0x244),
785 REG(0x034),
786 REG(0x030),
787 REG(0x038),
788 REG(0x03c),
789 REG(0x168),
790 REG(0x140),
791 REG(0x110),
792 REG(0x1c0),
793 REG(0x1c4),
794 REG(0x1c8),
795 REG(0x180),
796 REG16(0x2b4),
798 NOP(5),
799 LRI(9, POSTED),
800 REG16(0x3a8),
801 REG16(0x28c),
802 REG16(0x288),
803 REG16(0x284),
804 REG16(0x280),
805 REG16(0x27c),
806 REG16(0x278),
807 REG16(0x274),
808 REG16(0x270),
810 END(80)
813 static const u8 gen8_rcs_offsets[] = {
814 NOP(1),
815 LRI(14, POSTED),
816 REG16(0x244),
817 REG(0x034),
818 REG(0x030),
819 REG(0x038),
820 REG(0x03c),
821 REG(0x168),
822 REG(0x140),
823 REG(0x110),
824 REG(0x11c),
825 REG(0x114),
826 REG(0x118),
827 REG(0x1c0),
828 REG(0x1c4),
829 REG(0x1c8),
831 NOP(3),
832 LRI(9, POSTED),
833 REG16(0x3a8),
834 REG16(0x28c),
835 REG16(0x288),
836 REG16(0x284),
837 REG16(0x280),
838 REG16(0x27c),
839 REG16(0x278),
840 REG16(0x274),
841 REG16(0x270),
843 NOP(13),
844 LRI(1, 0),
845 REG(0x0c8),
847 END(80)
850 static const u8 gen9_rcs_offsets[] = {
851 NOP(1),
852 LRI(14, POSTED),
853 REG16(0x244),
854 REG(0x34),
855 REG(0x30),
856 REG(0x38),
857 REG(0x3c),
858 REG(0x168),
859 REG(0x140),
860 REG(0x110),
861 REG(0x11c),
862 REG(0x114),
863 REG(0x118),
864 REG(0x1c0),
865 REG(0x1c4),
866 REG(0x1c8),
868 NOP(3),
869 LRI(9, POSTED),
870 REG16(0x3a8),
871 REG16(0x28c),
872 REG16(0x288),
873 REG16(0x284),
874 REG16(0x280),
875 REG16(0x27c),
876 REG16(0x278),
877 REG16(0x274),
878 REG16(0x270),
880 NOP(13),
881 LRI(1, 0),
882 REG(0xc8),
884 NOP(13),
885 LRI(44, POSTED),
886 REG(0x28),
887 REG(0x9c),
888 REG(0xc0),
889 REG(0x178),
890 REG(0x17c),
891 REG16(0x358),
892 REG(0x170),
893 REG(0x150),
894 REG(0x154),
895 REG(0x158),
896 REG16(0x41c),
897 REG16(0x600),
898 REG16(0x604),
899 REG16(0x608),
900 REG16(0x60c),
901 REG16(0x610),
902 REG16(0x614),
903 REG16(0x618),
904 REG16(0x61c),
905 REG16(0x620),
906 REG16(0x624),
907 REG16(0x628),
908 REG16(0x62c),
909 REG16(0x630),
910 REG16(0x634),
911 REG16(0x638),
912 REG16(0x63c),
913 REG16(0x640),
914 REG16(0x644),
915 REG16(0x648),
916 REG16(0x64c),
917 REG16(0x650),
918 REG16(0x654),
919 REG16(0x658),
920 REG16(0x65c),
921 REG16(0x660),
922 REG16(0x664),
923 REG16(0x668),
924 REG16(0x66c),
925 REG16(0x670),
926 REG16(0x674),
927 REG16(0x678),
928 REG16(0x67c),
929 REG(0x68),
931 END(176)
934 static const u8 gen11_rcs_offsets[] = {
935 NOP(1),
936 LRI(15, POSTED),
937 REG16(0x244),
938 REG(0x034),
939 REG(0x030),
940 REG(0x038),
941 REG(0x03c),
942 REG(0x168),
943 REG(0x140),
944 REG(0x110),
945 REG(0x11c),
946 REG(0x114),
947 REG(0x118),
948 REG(0x1c0),
949 REG(0x1c4),
950 REG(0x1c8),
951 REG(0x180),
953 NOP(1),
954 LRI(9, POSTED),
955 REG16(0x3a8),
956 REG16(0x28c),
957 REG16(0x288),
958 REG16(0x284),
959 REG16(0x280),
960 REG16(0x27c),
961 REG16(0x278),
962 REG16(0x274),
963 REG16(0x270),
965 LRI(1, POSTED),
966 REG(0x1b0),
968 NOP(10),
969 LRI(1, 0),
970 REG(0x0c8),
972 END(80)
975 static const u8 gen12_rcs_offsets[] = {
976 NOP(1),
977 LRI(13, POSTED),
978 REG16(0x244),
979 REG(0x034),
980 REG(0x030),
981 REG(0x038),
982 REG(0x03c),
983 REG(0x168),
984 REG(0x140),
985 REG(0x110),
986 REG(0x1c0),
987 REG(0x1c4),
988 REG(0x1c8),
989 REG(0x180),
990 REG16(0x2b4),
992 NOP(5),
993 LRI(9, POSTED),
994 REG16(0x3a8),
995 REG16(0x28c),
996 REG16(0x288),
997 REG16(0x284),
998 REG16(0x280),
999 REG16(0x27c),
1000 REG16(0x278),
1001 REG16(0x274),
1002 REG16(0x270),
1004 LRI(3, POSTED),
1005 REG(0x1b0),
1006 REG16(0x5a8),
1007 REG16(0x5ac),
1009 NOP(6),
1010 LRI(1, 0),
1011 REG(0x0c8),
1012 NOP(3 + 9 + 1),
1014 LRI(51, POSTED),
1015 REG16(0x588),
1016 REG16(0x588),
1017 REG16(0x588),
1018 REG16(0x588),
1019 REG16(0x588),
1020 REG16(0x588),
1021 REG(0x028),
1022 REG(0x09c),
1023 REG(0x0c0),
1024 REG(0x178),
1025 REG(0x17c),
1026 REG16(0x358),
1027 REG(0x170),
1028 REG(0x150),
1029 REG(0x154),
1030 REG(0x158),
1031 REG16(0x41c),
1032 REG16(0x600),
1033 REG16(0x604),
1034 REG16(0x608),
1035 REG16(0x60c),
1036 REG16(0x610),
1037 REG16(0x614),
1038 REG16(0x618),
1039 REG16(0x61c),
1040 REG16(0x620),
1041 REG16(0x624),
1042 REG16(0x628),
1043 REG16(0x62c),
1044 REG16(0x630),
1045 REG16(0x634),
1046 REG16(0x638),
1047 REG16(0x63c),
1048 REG16(0x640),
1049 REG16(0x644),
1050 REG16(0x648),
1051 REG16(0x64c),
1052 REG16(0x650),
1053 REG16(0x654),
1054 REG16(0x658),
1055 REG16(0x65c),
1056 REG16(0x660),
1057 REG16(0x664),
1058 REG16(0x668),
1059 REG16(0x66c),
1060 REG16(0x670),
1061 REG16(0x674),
1062 REG16(0x678),
1063 REG16(0x67c),
1064 REG(0x068),
1065 REG(0x084),
1066 NOP(1),
1068 END(192)
1071 #undef END
1072 #undef REG16
1073 #undef REG
1074 #undef LRI
1075 #undef NOP
1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1080 * The gen12+ lists only have the registers we program in the basic
1081 * default state. We rely on the context image using relative
1082 * addressing to automatically fix up the register state between the
1083 * physical engines for a virtual engine.
1085 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1086 !intel_engine_has_relative_mmio(engine));
1088 if (engine->class == RENDER_CLASS) {
1089 if (INTEL_GEN(engine->i915) >= 12)
1090 return gen12_rcs_offsets;
1091 else if (INTEL_GEN(engine->i915) >= 11)
1092 return gen11_rcs_offsets;
1093 else if (INTEL_GEN(engine->i915) >= 9)
1094 return gen9_rcs_offsets;
1095 else
1096 return gen8_rcs_offsets;
1097 } else {
1098 if (INTEL_GEN(engine->i915) >= 12)
1099 return gen12_xcs_offsets;
1100 else if (INTEL_GEN(engine->i915) >= 9)
1101 return gen9_xcs_offsets;
1102 else
1103 return gen8_xcs_offsets;
1107 static struct i915_request *
1108 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1110 struct i915_request *rq, *rn, *active = NULL;
1111 struct list_head *pl;
1112 int prio = I915_PRIORITY_INVALID;
1114 lockdep_assert_held(&engine->active.lock);
1116 list_for_each_entry_safe_reverse(rq, rn,
1117 &engine->active.requests,
1118 sched.link) {
1119 if (i915_request_completed(rq))
1120 continue; /* XXX */
1122 __i915_request_unsubmit(rq);
1125 * Push the request back into the queue for later resubmission.
1126 * If this request is not native to this physical engine (i.e.
1127 * it came from a virtual source), push it back onto the virtual
1128 * engine so that it can be moved across onto another physical
1129 * engine as load dictates.
1131 if (likely(rq->execution_mask == engine->mask)) {
1132 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1133 if (rq_prio(rq) != prio) {
1134 prio = rq_prio(rq);
1135 pl = i915_sched_lookup_priolist(engine, prio);
1137 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1139 list_move(&rq->sched.link, pl);
1140 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1142 /* Check in case we rollback so far we wrap [size/2] */
1143 if (intel_ring_direction(rq->ring,
1144 rq->tail,
1145 rq->ring->tail + 8) > 0)
1146 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1148 active = rq;
1149 } else {
1150 struct intel_engine_cs *owner = rq->context->engine;
1152 WRITE_ONCE(rq->engine, owner);
1153 owner->submit_request(rq);
1154 active = NULL;
1158 return active;
1161 struct i915_request *
1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1164 struct intel_engine_cs *engine =
1165 container_of(execlists, typeof(*engine), execlists);
1167 return __unwind_incomplete_requests(engine);
1170 static inline void
1171 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1174 * Currently only used when GVT-g is enabled. When GVT-g is disabled,
1175 * the compiler should eliminate this function as dead code.
1177 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178 return;
1180 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181 status, rq);
1184 static void intel_engine_context_in(struct intel_engine_cs *engine)
1186 unsigned long flags;
1188 if (atomic_add_unless(&engine->stats.active, 1, 0))
1189 return;
1191 write_seqlock_irqsave(&engine->stats.lock, flags);
1192 if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193 engine->stats.start = ktime_get();
1194 atomic_inc(&engine->stats.active);
1196 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1199 static void intel_engine_context_out(struct intel_engine_cs *engine)
1201 unsigned long flags;
1203 GEM_BUG_ON(!atomic_read(&engine->stats.active));
1205 if (atomic_add_unless(&engine->stats.active, -1, 1))
1206 return;
1208 write_seqlock_irqsave(&engine->stats.lock, flags);
1209 if (atomic_dec_and_test(&engine->stats.active)) {
1210 engine->stats.total =
1211 ktime_add(engine->stats.total,
1212 ktime_sub(ktime_get(), engine->stats.start));
1214 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1217 static void
1218 execlists_check_context(const struct intel_context *ce,
1219 const struct intel_engine_cs *engine,
1220 const char *when)
1222 const struct intel_ring *ring = ce->ring;
1223 u32 *regs = ce->lrc_reg_state;
1224 bool valid = true;
1225 int x;
1227 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1228 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1229 engine->name,
1230 regs[CTX_RING_START],
1231 i915_ggtt_offset(ring->vma));
1232 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1233 valid = false;
1236 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1237 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1238 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1239 engine->name,
1240 regs[CTX_RING_CTL],
1241 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1242 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1243 valid = false;
1246 x = lrc_ring_mi_mode(engine);
1247 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1248 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1249 engine->name, regs[x + 1]);
1250 regs[x + 1] &= ~STOP_RING;
1251 regs[x + 1] |= STOP_RING << 16;
1252 valid = false;
1255 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1258 static void restore_default_state(struct intel_context *ce,
1259 struct intel_engine_cs *engine)
1261 u32 *regs;
1263 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1264 execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1266 ce->runtime.last = intel_context_get_runtime(ce);
1269 static void reset_active(struct i915_request *rq,
1270 struct intel_engine_cs *engine)
1272 struct intel_context * const ce = rq->context;
1273 u32 head;
1276 * The executing context has been cancelled. We want to prevent
1277 * further execution along this context and propagate the error on
1278 * to anything depending on its results.
1280 * In __i915_request_submit(), we apply the -EIO and remove the
1281 * requests' payloads for any banned requests. But first, we must
1282 * rewind the context back to the start of the incomplete request so
1283 * that we do not jump back into the middle of the batch.
1285 * We preserve the breadcrumbs and semaphores of the incomplete
1286 * requests so that inter-timeline dependencies (i.e other timelines)
1287 * remain correctly ordered. And we defer to __i915_request_submit()
1288 * so that all asynchronous waits are correctly handled.
1290 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1291 rq->fence.context, rq->fence.seqno);
1293 /* On resubmission of the active request, payload will be scrubbed */
1294 if (i915_request_completed(rq))
1295 head = rq->tail;
1296 else
1297 head = active_request(ce->timeline, rq)->head;
1298 head = intel_ring_wrap(ce->ring, head);
1300 /* Scrub the context image to prevent replaying the previous batch */
1301 restore_default_state(ce, engine);
1302 __execlists_update_reg_state(ce, engine, head);
1304 /* We've switched away, so this should be a no-op, but intent matters */
1305 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1308 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1310 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1311 ce->runtime.num_underflow += dt < 0;
1312 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1313 #endif
1316 static void intel_context_update_runtime(struct intel_context *ce)
1318 u32 old;
1319 s32 dt;
1321 if (intel_context_is_barrier(ce))
1322 return;
1324 old = ce->runtime.last;
1325 ce->runtime.last = intel_context_get_runtime(ce);
1326 dt = ce->runtime.last - old;
1328 if (unlikely(dt <= 0)) {
1329 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1330 old, ce->runtime.last, dt);
1331 st_update_runtime_underflow(ce, dt);
1332 return;
1335 ewma_runtime_add(&ce->runtime.avg, dt);
1336 ce->runtime.total += dt;
1339 static inline struct intel_engine_cs *
1340 __execlists_schedule_in(struct i915_request *rq)
1342 struct intel_engine_cs * const engine = rq->engine;
1343 struct intel_context * const ce = rq->context;
1345 intel_context_get(ce);
1347 if (unlikely(intel_context_is_banned(ce)))
1348 reset_active(rq, engine);
1350 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1351 execlists_check_context(ce, engine, "before");
1353 if (ce->tag) {
1354 /* Use a fixed tag for OA and friends */
1355 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1356 ce->lrc.ccid = ce->tag;
1357 } else {
1358 /* We don't need a strict matching tag, just different values */
1359 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1361 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1362 clear_bit(tag - 1, &engine->context_tag);
1363 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1365 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1368 ce->lrc.ccid |= engine->execlists.ccid;
1370 __intel_gt_pm_get(engine->gt);
1371 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1372 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1373 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1374 intel_engine_context_in(engine);
1376 return engine;
1379 static inline struct i915_request *
1380 execlists_schedule_in(struct i915_request *rq, int idx)
1382 struct intel_context * const ce = rq->context;
1383 struct intel_engine_cs *old;
1385 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1386 trace_i915_request_in(rq, idx);
1388 old = READ_ONCE(ce->inflight);
1389 do {
1390 if (!old) {
1391 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1392 break;
1394 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1396 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1397 return i915_request_get(rq);
1400 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1402 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1403 struct i915_request *next = READ_ONCE(ve->request);
1405 if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1406 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1409 static inline void
1410 __execlists_schedule_out(struct i915_request *rq,
1411 struct intel_engine_cs * const engine,
1412 unsigned int ccid)
1414 struct intel_context * const ce = rq->context;
1417 * NB process_csb() is not under the engine->active.lock and hence
1418 * schedule_out can race with schedule_in meaning that we should
1419 * refrain from doing non-trivial work here.
1422 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1423 execlists_check_context(ce, engine, "after");
1426 * If we have just completed this context, the engine may now be
1427 * idle and we want to re-enter powersaving.
1429 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1430 i915_request_completed(rq))
1431 intel_engine_add_retire(engine, ce->timeline);
1433 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1434 ccid &= GEN12_MAX_CONTEXT_HW_ID;
1435 if (ccid < BITS_PER_LONG) {
1436 GEM_BUG_ON(ccid == 0);
1437 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1438 set_bit(ccid - 1, &engine->context_tag);
1441 intel_context_update_runtime(ce);
1442 intel_engine_context_out(engine);
1443 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1444 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1445 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1446 intel_gt_pm_put_async(engine->gt);
1449 * If this is part of a virtual engine, its next request may
1450 * have been blocked waiting for access to the active context.
1451 * We have to kick all the siblings again in case we need to
1452 * switch (e.g. the next request is not runnable on this
1453 * engine). Hopefully, we will already have submitted the next
1454 * request before the tasklet runs and do not need to rebuild
1455 * each virtual tree and kick everyone again.
1457 if (ce->engine != engine)
1458 kick_siblings(rq, ce);
1460 intel_context_put(ce);
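/*
 * Illustrative sketch (not driver code) of the ccid tag round trip between
 * __execlists_schedule_in() and __execlists_schedule_out() above: a small
 * per-engine bitmap hands out tags via ffs(), the tag is packed into the
 * ccid (the driver shifts by GEN11_SW_CTX_ID_SHIFT - 32), and schedule_out
 * unpacks the ccid and returns the tag to the bitmap for reuse.
 */
#include <strings.h>    /* ffs() */

static unsigned int example_ccid_alloc(unsigned int *bitmap, unsigned int shift)
{
        int tag = ffs(*bitmap);         /* 1-based index of the first free tag */

        if (!tag)
                return 0;               /* no tag free; the driver asserts this never happens */

        *bitmap &= ~(1U << (tag - 1));  /* mark the tag busy */
        return (unsigned int)tag << shift;
}

static void example_ccid_release(unsigned int *bitmap, unsigned int ccid,
                                 unsigned int shift)
{
        unsigned int tag = ccid >> shift;

        *bitmap |= 1U << (tag - 1);     /* tag becomes free for reuse */
}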
1463 static inline void
1464 execlists_schedule_out(struct i915_request *rq)
1466 struct intel_context * const ce = rq->context;
1467 struct intel_engine_cs *cur, *old;
1468 u32 ccid;
1470 trace_i915_request_out(rq);
1472 ccid = rq->context->lrc.ccid;
1473 old = READ_ONCE(ce->inflight);
1475 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1476 while (!try_cmpxchg(&ce->inflight, &old, cur));
1477 if (!cur)
1478 __execlists_schedule_out(rq, old, ccid);
1480 i915_request_put(rq);
1483 static u64 execlists_update_context(struct i915_request *rq)
1485 struct intel_context *ce = rq->context;
1486 u64 desc = ce->lrc.desc;
1487 u32 tail, prev;
1490 * WaIdleLiteRestore:bdw,skl
1492 * We should never submit the context with the same RING_TAIL twice
1493 * just in case we submit an empty ring, which confuses the HW.
1495 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1496 * the normal request to be able to always advance the RING_TAIL on
1497 * subsequent resubmissions (for lite restore). Should that fail us,
1498 * and we try and submit the same tail again, force the context
1499 * reload.
1501 * If we need to return to a preempted context, we need to skip the
1502 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1503 * HW has a tendency to ignore us rewinding the TAIL to the end of
1504 * an earlier request.
1506 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1507 prev = rq->ring->tail;
1508 tail = intel_ring_set_tail(rq->ring, rq->tail);
1509 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1510 desc |= CTX_DESC_FORCE_RESTORE;
1511 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1512 rq->tail = rq->wa_tail;
1515 * Make sure the context image is complete before we submit it to HW.
1517 * Ostensibly, writes (including the WCB) should be flushed prior to
1518 * an uncached write such as our mmio register access, but the empirical
1519 * evidence (esp. on Braswell) suggests that the WC write into memory
1520 * may not be visible to the HW prior to the completion of the UC
1521 * register write and that we may begin execution from the context
1522 * before its image is complete, leading to invalid PD chasing.
1524 wmb();
1526 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1527 return desc;
1530 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1532 if (execlists->ctrl_reg) {
1533 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1534 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1535 } else {
1536 writel(upper_32_bits(desc), execlists->submit_reg);
1537 writel(lower_32_bits(desc), execlists->submit_reg);
1541 static __maybe_unused char *
1542 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1544 if (!rq)
1545 return "";
1547 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1548 prefix,
1549 rq->context->lrc.ccid,
1550 rq->fence.context, rq->fence.seqno,
1551 i915_request_completed(rq) ? "!" :
1552 i915_request_started(rq) ? "*" :
1554 rq_prio(rq));
1556 return buf;
1559 static __maybe_unused void
1560 trace_ports(const struct intel_engine_execlists *execlists,
1561 const char *msg,
1562 struct i915_request * const *ports)
1564 const struct intel_engine_cs *engine =
1565 container_of(execlists, typeof(*engine), execlists);
1566 char __maybe_unused p0[40], p1[40];
1568 if (!ports[0])
1569 return;
1571 ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1572 dump_port(p0, sizeof(p0), "", ports[0]),
1573 dump_port(p1, sizeof(p1), ", ", ports[1]));
1576 static inline bool
1577 reset_in_progress(const struct intel_engine_execlists *execlists)
1579 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1582 static __maybe_unused bool
1583 assert_pending_valid(const struct intel_engine_execlists *execlists,
1584 const char *msg)
1586 struct intel_engine_cs *engine =
1587 container_of(execlists, typeof(*engine), execlists);
1588 struct i915_request * const *port, *rq;
1589 struct intel_context *ce = NULL;
1590 bool sentinel = false;
1591 u32 ccid = -1;
1593 trace_ports(execlists, msg, execlists->pending);
1595 /* We may be messing around with the lists during reset, lalala */
1596 if (reset_in_progress(execlists))
1597 return true;
1599 if (!execlists->pending[0]) {
1600 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1601 engine->name);
1602 return false;
1605 if (execlists->pending[execlists_num_ports(execlists)]) {
1606 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1607 engine->name, execlists_num_ports(execlists));
1608 return false;
1611 for (port = execlists->pending; (rq = *port); port++) {
1612 unsigned long flags;
1613 bool ok = true;
1615 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1616 GEM_BUG_ON(!i915_request_is_active(rq));
1618 if (ce == rq->context) {
1619 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1620 engine->name,
1621 ce->timeline->fence_context,
1622 port - execlists->pending);
1623 return false;
1625 ce = rq->context;
1627 if (ccid == ce->lrc.ccid) {
1628 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1629 engine->name,
1630 ccid, ce->timeline->fence_context,
1631 port - execlists->pending);
1632 return false;
1634 ccid = ce->lrc.ccid;
1637 * Sentinels are supposed to be the last request so they flush
1638 * the current execution off the HW. Check that they are the only
1639 * request in the pending submission.
1641 if (sentinel) {
1642 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1643 engine->name,
1644 ce->timeline->fence_context,
1645 port - execlists->pending);
1646 return false;
1648 sentinel = i915_request_has_sentinel(rq);
1650 /* Hold tightly onto the lock to prevent concurrent retires! */
1651 if (!spin_trylock_irqsave(&rq->lock, flags))
1652 continue;
1654 if (i915_request_completed(rq))
1655 goto unlock;
1657 if (i915_active_is_idle(&ce->active) &&
1658 !intel_context_is_barrier(ce)) {
1659 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1660 engine->name,
1661 ce->timeline->fence_context,
1662 port - execlists->pending);
1663 ok = false;
1664 goto unlock;
1667 if (!i915_vma_is_pinned(ce->state)) {
1668 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1669 engine->name,
1670 ce->timeline->fence_context,
1671 port - execlists->pending);
1672 ok = false;
1673 goto unlock;
1676 if (!i915_vma_is_pinned(ce->ring->vma)) {
1677 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1678 engine->name,
1679 ce->timeline->fence_context,
1680 port - execlists->pending);
1681 ok = false;
1682 goto unlock;
1685 unlock:
1686 spin_unlock_irqrestore(&rq->lock, flags);
1687 if (!ok)
1688 return false;
1691 return ce;
1694 static void execlists_submit_ports(struct intel_engine_cs *engine)
1696 struct intel_engine_execlists *execlists = &engine->execlists;
1697 unsigned int n;
1699 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1702 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1703 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1704 * not be relinquished until the device is idle (see
1705 * i915_gem_idle_work_handler()). As a precaution, we make sure
1706 * that all ELSP are drained i.e. we have processed the CSB,
1707 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1709 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1712 * ELSQ note: the submit queue is not cleared after being submitted
1713 * to the HW so we need to make sure we always clean it up. This is
1714 * currently ensured by the fact that we always write the same number
1715 * of elsq entries; keep this in mind before changing the loop below.
1717 for (n = execlists_num_ports(execlists); n--; ) {
1718 struct i915_request *rq = execlists->pending[n];
1720 write_desc(execlists,
1721 rq ? execlists_update_context(rq) : 0,
1725 /* we need to manually load the submit queue */
1726 if (execlists->ctrl_reg)
1727 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1730 static bool ctx_single_port_submission(const struct intel_context *ce)
1732 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1733 intel_context_force_single_submission(ce));
1736 static bool can_merge_ctx(const struct intel_context *prev,
1737 const struct intel_context *next)
1739 if (prev != next)
1740 return false;
1742 if (ctx_single_port_submission(prev))
1743 return false;
1745 return true;
1748 static unsigned long i915_request_flags(const struct i915_request *rq)
1750 return READ_ONCE(rq->fence.flags);
1753 static bool can_merge_rq(const struct i915_request *prev,
1754 const struct i915_request *next)
1756 GEM_BUG_ON(prev == next);
1757 GEM_BUG_ON(!assert_priority_queue(prev, next));
1760 * We do not submit known completed requests. Therefore if the next
1761 * request is already completed, we can pretend to merge it in
1762 * with the previous context (and we will skip updating the ELSP
1763 * and tracking). Thus hopefully keeping the ELSP full with active
1764 * contexts, despite the best efforts of preempt-to-busy to confuse
1765 * us.
1767 if (i915_request_completed(next))
1768 return true;
1770 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1771 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1772 BIT(I915_FENCE_FLAG_SENTINEL))))
1773 return false;
1775 if (!can_merge_ctx(prev->context, next->context))
1776 return false;
1778 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1779 return true;
1782 static void virtual_update_register_offsets(u32 *regs,
1783 struct intel_engine_cs *engine)
1785 set_offsets(regs, reg_offsets(engine), engine, false);
1788 static bool virtual_matches(const struct virtual_engine *ve,
1789 const struct i915_request *rq,
1790 const struct intel_engine_cs *engine)
1792 const struct intel_engine_cs *inflight;
1794 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1795 return false;
1798 * We track when the HW has completed saving the context image
1799 * (i.e. when we have seen the final CS event switching out of
1800 * the context) and must not overwrite the context image before
1801 * then. This restricts us to only using the active engine
1802 * while the previous virtualized request is inflight (so
1803 * we reuse the register offsets). This is a very small
1804 * hysteresis on the greedy selection algorithm.
1806 inflight = intel_context_inflight(&ve->context);
1807 if (inflight && inflight != engine)
1808 return false;
1810 return true;
1813 static void virtual_xfer_context(struct virtual_engine *ve,
1814 struct intel_engine_cs *engine)
1816 unsigned int n;
1818 if (likely(engine == ve->siblings[0]))
1819 return;
1821 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1822 if (!intel_engine_has_relative_mmio(engine))
1823 virtual_update_register_offsets(ve->context.lrc_reg_state,
1824 engine);
1827 * Move the bound engine to the top of the list for
1828 * future execution. We then kick this tasklet first
1829 * before checking others, so that we preferentially
1830 * reuse this set of bound registers.
1832 for (n = 1; n < ve->num_siblings; n++) {
1833 if (ve->siblings[n] == engine) {
1834 swap(ve->siblings[n], ve->siblings[0]);
1835 break;
1840 #define for_each_waiter(p__, rq__) \
1841 list_for_each_entry_lockless(p__, \
1842 &(rq__)->sched.waiters_list, \
1843 wait_link)
1845 #define for_each_signaler(p__, rq__) \
1846 list_for_each_entry_rcu(p__, \
1847 &(rq__)->sched.signalers_list, \
1848 signal_link)
1850 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1852 LIST_HEAD(list);
1855 * We want to move the interrupted request to the back of
1856 * the round-robin list (i.e. its priority level), but
1857 * in doing so, we must then move all requests that were in
1858 * flight and were waiting for the interrupted request to
1859 * be run after it again.
1861 do {
1862 struct i915_dependency *p;
1864 GEM_BUG_ON(i915_request_is_active(rq));
1865 list_move_tail(&rq->sched.link, pl);
1867 for_each_waiter(p, rq) {
1868 struct i915_request *w =
1869 container_of(p->waiter, typeof(*w), sched);
1871 if (p->flags & I915_DEPENDENCY_WEAK)
1872 continue;
1874 /* Leave semaphores spinning on the other engines */
1875 if (w->engine != rq->engine)
1876 continue;
1878 /* No waiter should start before its signaler */
1879 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1880 i915_request_started(w) &&
1881 !i915_request_completed(rq));
1883 GEM_BUG_ON(i915_request_is_active(w));
1884 if (!i915_request_is_ready(w))
1885 continue;
1887 if (rq_prio(w) < rq_prio(rq))
1888 continue;
1890 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1891 list_move_tail(&w->sched.link, &list);
1894 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1895 } while (rq);
1898 static void defer_active(struct intel_engine_cs *engine)
1900 struct i915_request *rq;
1902 rq = __unwind_incomplete_requests(engine);
1903 if (!rq)
1904 return;
1906 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1909 static bool
1910 need_timeslice(const struct intel_engine_cs *engine,
1911 const struct i915_request *rq,
1912 const struct rb_node *rb)
1914 int hint;
1916 if (!intel_engine_has_timeslices(engine))
1917 return false;
1919 hint = engine->execlists.queue_priority_hint;
1921 if (rb) {
1922 const struct virtual_engine *ve =
1923 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1924 const struct intel_engine_cs *inflight =
1925 intel_context_inflight(&ve->context);
1927 if (!inflight || inflight == engine) {
1928 struct i915_request *next;
1930 rcu_read_lock();
1931 next = READ_ONCE(ve->request);
1932 if (next)
1933 hint = max(hint, rq_prio(next));
1934 rcu_read_unlock();
1938 if (!list_is_last(&rq->sched.link, &engine->active.requests))
1939 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1941 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1942 return hint >= effective_prio(rq);
1945 static bool
1946 timeslice_yield(const struct intel_engine_execlists *el,
1947 const struct i915_request *rq)
1950 * Once bitten, forever smitten!
1952 * If the active context ever busy-waited on a semaphore,
1953 * it will be treated as a hog until the end of its timeslice (i.e.
1954 * until it is scheduled out and replaced by a new submission,
1955 * possibly even its own lite-restore). The HW only sends an interrupt
1956 * on the first miss, and we do not know if that semaphore has been
1957 * signaled, or even if it is now stuck on another semaphore. Play
1958 * safe, yield if it might be stuck -- it will be given a fresh
1959 * timeslice in the near future.
1961 return rq->context->lrc.ccid == READ_ONCE(el->yield);
1964 static bool
1965 timeslice_expired(const struct intel_engine_execlists *el,
1966 const struct i915_request *rq)
1968 return timer_expired(&el->timer) || timeslice_yield(el, rq);
1971 static int
1972 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1974 if (list_is_last(&rq->sched.link, &engine->active.requests))
1975 return engine->execlists.queue_priority_hint;
1977 return rq_prio(list_next_entry(rq, sched.link));
1980 static inline unsigned long
1981 timeslice(const struct intel_engine_cs *engine)
1983 return READ_ONCE(engine->props.timeslice_duration_ms);
1986 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1988 const struct intel_engine_execlists *execlists = &engine->execlists;
1989 const struct i915_request *rq = *execlists->active;
1991 if (!rq || i915_request_completed(rq))
1992 return 0;
1994 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1995 return 0;
1997 return timeslice(engine);
2000 static void set_timeslice(struct intel_engine_cs *engine)
2002 unsigned long duration;
2004 if (!intel_engine_has_timeslices(engine))
2005 return;
2007 duration = active_timeslice(engine);
2008 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2010 set_timer_ms(&engine->execlists.timer, duration);
2013 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2015 struct intel_engine_execlists *execlists = &engine->execlists;
2016 unsigned long duration;
2018 if (!intel_engine_has_timeslices(engine))
2019 return;
2021 WRITE_ONCE(execlists->switch_priority_hint, prio);
2022 if (prio == INT_MIN)
2023 return;
2025 if (timer_pending(&execlists->timer))
2026 return;
2028 duration = timeslice(engine);
2029 ENGINE_TRACE(engine,
2030 "start timeslicing, prio:%d, interval:%lu",
2031 prio, duration);
2033 set_timer_ms(&execlists->timer, duration);
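/*
 * Illustrative sketch, not used by the driver: the gate start_timeslice()
 * above applies before arming the timer. A priority of INT_MIN means nothing
 * is waiting so no slice is needed, and a timer that is already pending is
 * left to run its course rather than being pushed back.
 */
static inline bool
should_arm_timeslice_sketch(int prio, bool timer_already_pending)
{
	if (prio == INT_MIN)
		return false;

	return !timer_already_pending;
}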
2036 static void record_preemption(struct intel_engine_execlists *execlists)
2038 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2041 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2042 const struct i915_request *rq)
2044 if (!rq)
2045 return 0;
2047 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2048 if (unlikely(intel_context_is_banned(rq->context)))
2049 return 1;
2051 return READ_ONCE(engine->props.preempt_timeout_ms);
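/*
 * Illustrative sketch, not used by the driver: active_preempt_timeout() above
 * as a pure function of its inputs. With nothing in flight the timer is left
 * unarmed (0), a banned context is given a token 1ms so the forced reset
 * fires almost immediately regardless of the sysfs setting, and everything
 * else honours the configured preempt_timeout_ms.
 */
static inline unsigned long
preempt_timeout_sketch(bool have_request, bool context_banned,
		       unsigned long sysfs_timeout_ms)
{
	if (!have_request)
		return 0;

	if (context_banned)
		return 1;

	return sysfs_timeout_ms;
}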
2054 static void set_preempt_timeout(struct intel_engine_cs *engine,
2055 const struct i915_request *rq)
2057 if (!intel_engine_has_preempt_reset(engine))
2058 return;
2060 set_timer_ms(&engine->execlists.preempt,
2061 active_preempt_timeout(engine, rq));
2064 static inline void clear_ports(struct i915_request **ports, int count)
2066 memset_p((void **)ports, NULL, count);
2069 static inline void
2070 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2072 /* A memcpy_p() would be very useful here! */
2073 while (count--)
2074 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
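/*
 * Illustrative sketch, not used by the driver: WRITE_ONCE() in copy_ports()
 * above is essentially a volatile store, which is what stops the compiler
 * from tearing or re-fusing the pointer writes. Spelled out without the
 * kernel helper, the same element-by-element copy looks like this:
 */
static inline void
copy_ptrs_untorn_sketch(void **dst, void **src, int count)
{
	while (count--)
		*(void * volatile *)dst++ = *src++;
}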
2077 static void execlists_dequeue(struct intel_engine_cs *engine)
2079 struct intel_engine_execlists * const execlists = &engine->execlists;
2080 struct i915_request **port = execlists->pending;
2081 struct i915_request ** const last_port = port + execlists->port_mask;
2082 struct i915_request * const *active;
2083 struct i915_request *last;
2084 struct rb_node *rb;
2085 bool submit = false;
2088 * Hardware submission is through 2 ports. Conceptually each port
2089 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2090 * static for a context, and unique to each, so we only execute
2091 * requests belonging to a single context from each ring. RING_HEAD
2092 * is maintained by the CS in the context image; it marks the place
2093 * where the CS got up to last time, and through RING_TAIL we tell the CS
2094 * where we want to execute up to this time.
2096 * In this list the requests are in order of execution. Consecutive
2097 * requests from the same context are adjacent in the ringbuffer. We
2098 * can combine these requests into a single RING_TAIL update:
2100 * RING_HEAD...req1...req2
2101 * ^- RING_TAIL
2102 * since to execute req2 the CS must first execute req1.
2104 * Our goal then is to point each port to the end of a consecutive
2105 * sequence of requests as the optimal (fewest wake ups
2106 * and context switches) submission.
2109 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2110 struct virtual_engine *ve =
2111 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2112 struct i915_request *rq = READ_ONCE(ve->request);
2114 if (!rq) { /* lazily cleanup after another engine handled rq */
2115 rb_erase_cached(rb, &execlists->virtual);
2116 RB_CLEAR_NODE(rb);
2117 rb = rb_first_cached(&execlists->virtual);
2118 continue;
2121 if (!virtual_matches(ve, rq, engine)) {
2122 rb = rb_next(rb);
2123 continue;
2126 break;
2130 * If the queue is higher priority than the last
2131 * request in the currently active context, submit afresh.
2132 * We will resubmit again afterwards in case we need to split
2133 * the active context to interject the preemption request,
2134 * i.e. we will retrigger preemption following the ack in case
2135 * of trouble.
2137 active = READ_ONCE(execlists->active);
2140 * In theory we can skip over completed contexts that have not
2141 * yet been processed by events (as those events are in flight):
2143 * while ((last = *active) && i915_request_completed(last))
2144 * active++;
2146 * However, the GPU cannot handle this as it will ultimately
2147 * find itself trying to jump back into a context it has just
2148 * completed and barf.
2151 if ((last = *active)) {
2152 if (need_preempt(engine, last, rb)) {
2153 if (i915_request_completed(last)) {
2154 tasklet_hi_schedule(&execlists->tasklet);
2155 return;
2158 ENGINE_TRACE(engine,
2159 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2160 last->fence.context,
2161 last->fence.seqno,
2162 last->sched.attr.priority,
2163 execlists->queue_priority_hint);
2164 record_preemption(execlists);
2167 * Don't let the RING_HEAD advance past the breadcrumb
2168 * as we unwind (and until we resubmit) so that we do
2169 * not accidentally tell it to go backwards.
2171 ring_set_paused(engine, 1);
2174 * Note that we have not stopped the GPU at this point,
2175 * so we are unwinding the incomplete requests as they
2176 * remain inflight and so by the time we do complete
2177 * the preemption, some of the unwound requests may
2178 * complete!
2180 __unwind_incomplete_requests(engine);
2182 last = NULL;
2183 } else if (need_timeslice(engine, last, rb) &&
2184 timeslice_expired(execlists, last)) {
2185 if (i915_request_completed(last)) {
2186 tasklet_hi_schedule(&execlists->tasklet);
2187 return;
2190 ENGINE_TRACE(engine,
2191 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2192 last->fence.context,
2193 last->fence.seqno,
2194 last->sched.attr.priority,
2195 execlists->queue_priority_hint,
2196 yesno(timeslice_yield(execlists, last)));
2198 ring_set_paused(engine, 1);
2199 defer_active(engine);
2202 * Unlike for preemption, if we rewind and continue
2203 * executing the same context as previously active,
2204 * the order of execution will remain the same and
2205 * the tail will only advance. We do not need to
2206 * force a full context restore, as a lite-restore
2207 * is sufficient to resample the monotonic TAIL.
2209 * If we switch to any other context, similarly we
2210 * will not rewind TAIL of current context, and
2211 * normal save/restore will preserve state and allow
2212 * us to later continue executing the same request.
2214 last = NULL;
2215 } else {
2217 * Otherwise if we already have a request pending
2218 * for execution after the current one, we can
2219 * just wait until the next CS event before
2220 * queuing more. In either case we will force a
2221 * lite-restore preemption event, but if we wait
2222 * we hopefully coalesce several updates into a single
2223 * submission.
2225 if (!list_is_last(&last->sched.link,
2226 &engine->active.requests)) {
2228 * Even if ELSP[1] is occupied and not worthy
2229 * of timeslices, our queue might be.
2231 start_timeslice(engine, queue_prio(execlists));
2232 return;
2237 while (rb) { /* XXX virtual is always taking precedence */
2238 struct virtual_engine *ve =
2239 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2240 struct i915_request *rq;
2242 spin_lock(&ve->base.active.lock);
2244 rq = ve->request;
2245 if (unlikely(!rq)) { /* lost the race to a sibling */
2246 spin_unlock(&ve->base.active.lock);
2247 rb_erase_cached(rb, &execlists->virtual);
2248 RB_CLEAR_NODE(rb);
2249 rb = rb_first_cached(&execlists->virtual);
2250 continue;
2253 GEM_BUG_ON(rq != ve->request);
2254 GEM_BUG_ON(rq->engine != &ve->base);
2255 GEM_BUG_ON(rq->context != &ve->context);
2257 if (rq_prio(rq) >= queue_prio(execlists)) {
2258 if (!virtual_matches(ve, rq, engine)) {
2259 spin_unlock(&ve->base.active.lock);
2260 rb = rb_next(rb);
2261 continue;
2264 if (last && !can_merge_rq(last, rq)) {
2265 spin_unlock(&ve->base.active.lock);
2266 start_timeslice(engine, rq_prio(rq));
2267 return; /* leave this for another sibling */
2270 ENGINE_TRACE(engine,
2271 "virtual rq=%llx:%lld%s, new engine? %s\n",
2272 rq->fence.context,
2273 rq->fence.seqno,
2274 i915_request_completed(rq) ? "!" :
2275 i915_request_started(rq) ? "*" :
2276 "",
2277 yesno(engine != ve->siblings[0]));
2279 WRITE_ONCE(ve->request, NULL);
2280 WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2281 INT_MIN);
2282 rb_erase_cached(rb, &execlists->virtual);
2283 RB_CLEAR_NODE(rb);
2285 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2286 WRITE_ONCE(rq->engine, engine);
2288 if (__i915_request_submit(rq)) {
2290 * Only after we confirm that we will submit
2291 * this request (i.e. it has not already
2292 * completed), do we want to update the context.
2294 * This serves two purposes. It avoids
2295 * unnecessary work if we are resubmitting an
2296 * already completed request after timeslicing.
2297 * But more importantly, it prevents us altering
2298 * ve->siblings[] on an idle context, where
2299 * we may be using ve->siblings[] in
2300 * virtual_context_enter / virtual_context_exit.
2302 virtual_xfer_context(ve, engine);
2303 GEM_BUG_ON(ve->siblings[0] != engine);
2305 submit = true;
2306 last = rq;
2308 i915_request_put(rq);
2311 * Hmm, we have a bunch of virtual engine requests,
2312 * but the first one was already completed (thanks
2313 * preempt-to-busy!). Keep looking at the veng queue
2314 * until we have no more relevant requests (i.e.
2315 * the normal submit queue has higher priority).
2317 if (!submit) {
2318 spin_unlock(&ve->base.active.lock);
2319 rb = rb_first_cached(&execlists->virtual);
2320 continue;
2324 spin_unlock(&ve->base.active.lock);
2325 break;
2328 while ((rb = rb_first_cached(&execlists->queue))) {
2329 struct i915_priolist *p = to_priolist(rb);
2330 struct i915_request *rq, *rn;
2331 int i;
2333 priolist_for_each_request_consume(rq, rn, p, i) {
2334 bool merge = true;
2337 * Can we combine this request with the current port?
2338 * It has to be the same context/ringbuffer and not
2339 * have any exceptions (e.g. GVT saying never to
2340 * combine contexts).
2342 * If we can combine the requests, we can execute both
2343 * by updating the RING_TAIL to point to the end of the
2344 * second request, and so we never need to tell the
2345 * hardware about the first.
2347 if (last && !can_merge_rq(last, rq)) {
2349 * If we are on the second port and cannot
2350 * combine this request with the last, then we
2351 * are done.
2353 if (port == last_port)
2354 goto done;
2357 * We must not populate both ELSP[] with the
2358 * same LRCA, i.e. we must submit 2 different
2359 * contexts if we submit 2 ELSP.
2361 if (last->context == rq->context)
2362 goto done;
2364 if (i915_request_has_sentinel(last))
2365 goto done;
2368 * If GVT overrides us we only ever submit
2369 * port[0], leaving port[1] empty. Note that we
2370 * also have to be careful that we don't queue
2371 * the same context (even though a different
2372 * request) to the second port.
2374 if (ctx_single_port_submission(last->context) ||
2375 ctx_single_port_submission(rq->context))
2376 goto done;
2378 merge = false;
2381 if (__i915_request_submit(rq)) {
2382 if (!merge) {
2383 *port = execlists_schedule_in(last, port - execlists->pending);
2384 port++;
2385 last = NULL;
2388 GEM_BUG_ON(last &&
2389 !can_merge_ctx(last->context,
2390 rq->context));
2391 GEM_BUG_ON(last &&
2392 i915_seqno_passed(last->fence.seqno,
2393 rq->fence.seqno));
2395 submit = true;
2396 last = rq;
2400 rb_erase_cached(&p->node, &execlists->queue);
2401 i915_priolist_free(p);
2404 done:
2406 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2408 * We choose the priority hint such that if we add a request of greater
2409 * priority than this, we kick the submission tasklet to decide on
2410 * the right order of submitting the requests to hardware. We must
2411 * also be prepared to reorder requests as they are in-flight on the
2412 * HW. We derive the priority hint then as the first "hole" in
2413 * the HW submission ports and if there are no available slots,
2414 * the priority of the lowest executing request, i.e. last.
2416 * When we do receive a higher priority request ready to run from the
2417 * user, see queue_request(), the priority hint is bumped to that
2418 * request triggering preemption on the next dequeue (or subsequent
2419 * interrupt for secondary ports).
2421 execlists->queue_priority_hint = queue_prio(execlists);
2423 if (submit) {
2424 *port = execlists_schedule_in(last, port - execlists->pending);
2425 execlists->switch_priority_hint =
2426 switch_prio(engine, *execlists->pending);
2429 * Skip if we ended up with exactly the same set of requests,
2430 * e.g. trying to timeslice a pair of ordered contexts
2432 if (!memcmp(active, execlists->pending,
2433 (port - execlists->pending + 1) * sizeof(*port))) {
2434 do
2435 execlists_schedule_out(fetch_and_zero(port));
2436 while (port-- != execlists->pending);
2438 goto skip_submit;
2440 clear_ports(port + 1, last_port - port);
2442 WRITE_ONCE(execlists->yield, -1);
2443 set_preempt_timeout(engine, *active);
2444 execlists_submit_ports(engine);
2445 } else {
2446 start_timeslice(engine, execlists->queue_priority_hint);
2447 skip_submit:
2448 ring_set_paused(engine, 0);
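/*
 * Illustrative sketch, not used by the driver: the merging rule applied by
 * execlists_dequeue() above, modelled on a plain array of context ids in
 * submission order. Consecutive requests sharing a context collapse into one
 * ELSP port (only the final RING_TAIL matters), a different context opens the
 * next port, and once every port is spoken for the remainder stays queued.
 * Context identity stands in for can_merge_rq() here; the single-port and
 * sentinel exceptions are ignored.
 */
static inline int
ports_needed_sketch(const int *ctx_ids, int nreq, int nports)
{
	int used = 0;
	int i;

	for (i = 0; i < nreq; i++) {
		if (used && ctx_ids[i] == ctx_ids[i - 1])
			continue; /* merge into the current port */

		if (used == nports)
			break; /* no free port; leave the rest for later */

		used++;
	}

	return used;
}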
2452 static void
2453 cancel_port_requests(struct intel_engine_execlists * const execlists)
2455 struct i915_request * const *port;
2457 for (port = execlists->pending; *port; port++)
2458 execlists_schedule_out(*port);
2459 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2461 /* Mark the end of active before we overwrite *active */
2462 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2463 execlists_schedule_out(*port);
2464 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2466 smp_wmb(); /* complete the seqlock for execlists_active() */
2467 WRITE_ONCE(execlists->active, execlists->inflight);
2470 static inline void
2471 invalidate_csb_entries(const u64 *first, const u64 *last)
2473 clflush((void *)first);
2474 clflush((void *)last);
2478 * Starting with Gen12, the status has a new format:
2480 * bit 0: switched to new queue
2481 * bit 1: reserved
2482 * bit 2: semaphore wait mode (poll or signal), only valid when
2483 * switch detail is set to "wait on semaphore"
2484 * bits 3-5: engine class
2485 * bits 6-11: engine instance
2486 * bits 12-14: reserved
2487 * bits 15-25: sw context id of the lrc the GT switched to
2488 * bits 26-31: sw counter of the lrc the GT switched to
2489 * bits 32-35: context switch detail
2490 * - 0: ctx complete
2491 * - 1: wait on sync flip
2492 * - 2: wait on vblank
2493 * - 3: wait on scanline
2494 * - 4: wait on semaphore
2495 * - 5: context preempted (not on SEMAPHORE_WAIT or
2496 * WAIT_FOR_EVENT)
2497 * bit 36: reserved
2498 * bits 37-43: wait detail (for switch detail 1 to 4)
2499 * bits 44-46: reserved
2500 * bits 47-57: sw context id of the lrc the GT switched away from
2501 * bits 58-63: sw counter of the lrc the GT switched away from
2503 static inline bool gen12_csb_parse(const u64 csb)
2505 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
2506 bool new_queue =
2507 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2510 * The context switch detail is not guaranteed to be 5 when a preemption
2511 * occurs, so we can't just check for that. The check below works for
2512 * all the cases we care about, including preemptions of WAIT
2513 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2514 * would require some extra handling, but we don't support that.
2516 if (!ctx_away_valid || new_queue) {
2517 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
2518 return true;
2522 * switch detail = 5 is covered by the case above and we do not expect a
2523 * context switch on an unsuccessful wait instruction since we always
2524 * use polling mode.
2526 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
2527 return false;
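/*
 * Illustrative sketch, not used by the driver: the Gen12 CSB fields from the
 * bit layout documented above, extracted with plain shifts and masks. The
 * driver only inspects the bits it needs via the GEN12_* helpers; the struct
 * and function names here are made up for the example.
 */
struct gen12_csb_fields_sketch {
	bool new_queue;		/* bit 0 */
	u32 to_ctx_id;		/* bits 15-25 */
	u32 to_counter;		/* bits 26-31 */
	u32 switch_detail;	/* bits 32-35 */
	u32 away_ctx_id;	/* bits 47-57 */
	u32 away_counter;	/* bits 58-63 */
};

static inline struct gen12_csb_fields_sketch
gen12_csb_decode_sketch(u64 csb)
{
	struct gen12_csb_fields_sketch f = {
		.new_queue	= csb & 1,
		.to_ctx_id	= (csb >> 15) & 0x7ff,
		.to_counter	= (csb >> 26) & 0x3f,
		.switch_detail	= (csb >> 32) & 0xf,
		.away_ctx_id	= (csb >> 47) & 0x7ff,
		.away_counter	= (csb >> 58) & 0x3f,
	};

	return f;
}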
2530 static inline bool gen8_csb_parse(const u64 csb)
2532 return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2535 static noinline u64
2536 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2538 u64 entry;
2541 * Reading from the HWSP has one particular advantage: we can detect
2542 * a stale entry. Since the write into HWSP is broken, we have no reason
2543 * to trust the HW at all; the mmio entry may equally be unordered, so
2544 * we prefer the path that is self-checking and as a last resort,
2545 * return the mmio value.
2547 * tgl,dg1:HSDES#22011327657
2549 preempt_disable();
2550 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) {
2551 int idx = csb - engine->execlists.csb_status;
2552 int status;
2554 status = GEN8_EXECLISTS_STATUS_BUF;
2555 if (idx >= 6) {
2556 status = GEN11_EXECLISTS_STATUS_BUF2;
2557 idx -= 6;
2559 status += sizeof(u64) * idx;
2561 entry = intel_uncore_read64(engine->uncore,
2562 _MMIO(engine->mmio_base + status));
2564 preempt_enable();
2566 return entry;
2569 static inline u64
2570 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2572 u64 entry = READ_ONCE(*csb);
2575 * Unfortunately, the GPU does not always serialise its write
2576 * of the CSB entries before its write of the CSB pointer, at least
2577 * from the perspective of the CPU, using what is known as a Global
2578 * Observation Point. We may read a new CSB tail pointer, but then
2579 * read the stale CSB entries, causing us to misinterpret the
2580 * context-switch events, and eventually declare the GPU hung.
2582 * icl:HSDES#1806554093
2583 * tgl:HSDES#22011248461
2585 if (unlikely(entry == -1))
2586 entry = wa_csb_read(engine, csb);
2588 /* Consume this entry so that we can spot its future reuse. */
2589 WRITE_ONCE(*csb, -1);
2591 /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
2592 return entry;
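/*
 * Illustrative sketch, not used by the driver: the consume-and-poison pattern
 * from csb_read() above. Each slot is overwritten with an impossible value
 * (-1) once consumed, so reading -1 later means the hardware has not yet
 * (re)written that slot and the caller must take the mmio workaround path.
 */
static inline bool
csb_consume_sketch(u64 *slot, u64 *entry)
{
	*entry = *slot;
	*slot = (u64)-1; /* poison, so reuse of a stale slot is detectable */

	return *entry != (u64)-1; /* false: stale, fall back to mmio */
}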
2595 static void process_csb(struct intel_engine_cs *engine)
2597 struct intel_engine_execlists * const execlists = &engine->execlists;
2598 u64 * const buf = execlists->csb_status;
2599 const u8 num_entries = execlists->csb_size;
2600 u8 head, tail;
2603 * As we modify our execlists state tracking we require exclusive
2604 * access. Either we are inside the tasklet, or the tasklet is disabled
2605 * and we assume that is only inside the reset paths and so serialised.
2607 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2608 !reset_in_progress(execlists));
2609 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2612 * Note that csb_write, csb_status may be either in HWSP or mmio.
2613 * When reading from the csb_write mmio register, we have to be
2614 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2615 * the low 4 bits. As it happens we know the next 4 bits are always
2616 * zero and so we can simply mask off the low u8 of the register
2617 * and treat it identically to reading from the HWSP (without having
2618 * to use explicit shifting and masking, and probably bifurcating
2619 * the code to handle the legacy mmio read).
2621 head = execlists->csb_head;
2622 tail = READ_ONCE(*execlists->csb_write);
2623 if (unlikely(head == tail))
2624 return;
2627 * We will consume all events from HW, or at least pretend to.
2629 * The sequence of events from the HW is deterministic, and derived
2630 * from our writes to the ELSP, with a smidgen of variability for
2631 * the arrival of the asynchronous requests wrt the inflight
2632 * execution. If the HW sends an event that does not correspond with
2633 * the one we are expecting, we have to abandon all hope as we lose
2634 * all tracking of what the engine is actually executing. We will
2635 * only detect we are out of sequence with the HW when we get an
2636 * 'impossible' event because we have already drained our own
2637 * preemption/promotion queue. If this occurs, we know that we likely
2638 * lost track of execution earlier and must unwind and restart; the
2639 * simplest way is to stop processing the event queue and force the
2640 * engine to reset.
2642 execlists->csb_head = tail;
2643 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2646 * Hopefully paired with a wmb() in HW!
2648 * We must complete the read of the write pointer before any reads
2649 * from the CSB, so that we do not see stale values. Without an rmb
2650 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2651 * we perform the READ_ONCE(*csb_write).
2653 rmb();
2654 do {
2655 bool promote;
2656 u64 csb;
2658 if (++head == num_entries)
2659 head = 0;
2662 * We are flying near dragons again.
2664 * We hold a reference to the request in execlist_port[]
2665 * but no more than that. We are operating in softirq
2666 * context and so cannot hold any mutex or sleep. That
2667 * prevents us stopping the requests we are processing
2668 * in port[] from being retired simultaneously (the
2669 * breadcrumb will be complete before we see the
2670 * context-switch). As we only hold the reference to the
2671 * request, any pointer chasing underneath the request
2672 * is subject to a potential use-after-free. Thus we
2673 * store all of the bookkeeping within port[] as
2674 * required, and avoid using unguarded pointers beneath
2675 * request itself. The same applies to the atomic
2676 * status notifier.
2679 csb = csb_read(engine, buf + head);
2680 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2681 head, upper_32_bits(csb), lower_32_bits(csb));
2683 if (INTEL_GEN(engine->i915) >= 12)
2684 promote = gen12_csb_parse(csb);
2685 else
2686 promote = gen8_csb_parse(csb);
2687 if (promote) {
2688 struct i915_request * const *old = execlists->active;
2690 if (GEM_WARN_ON(!*execlists->pending)) {
2691 execlists->error_interrupt |= ERROR_CSB;
2692 break;
2695 ring_set_paused(engine, 0);
2697 /* Point active to the new ELSP; prevent overwriting */
2698 WRITE_ONCE(execlists->active, execlists->pending);
2699 smp_wmb(); /* notify execlists_active() */
2701 /* cancel old inflight, prepare for switch */
2702 trace_ports(execlists, "preempted", old);
2703 while (*old)
2704 execlists_schedule_out(*old++);
2706 /* switch pending to inflight */
2707 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2708 copy_ports(execlists->inflight,
2709 execlists->pending,
2710 execlists_num_ports(execlists));
2711 smp_wmb(); /* complete the seqlock */
2712 WRITE_ONCE(execlists->active, execlists->inflight);
2714 /* XXX Magic delay for tgl */
2715 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2717 WRITE_ONCE(execlists->pending[0], NULL);
2718 } else {
2719 if (GEM_WARN_ON(!*execlists->active)) {
2720 execlists->error_interrupt |= ERROR_CSB;
2721 break;
2724 /* port0 completed, advanced to port1 */
2725 trace_ports(execlists, "completed", execlists->active);
2728 * We rely on the hardware being strongly
2729 * ordered, that the breadcrumb write is
2730 * coherent (visible from the CPU) before the
2731 * user interrupt is processed. One might assume
2732 * that the breadcrumb write being before the
2733 * user interrupt and the CS event for the context
2734 * switch would therefore be before the CS event
2735 * itself...
2737 if (GEM_SHOW_DEBUG() &&
2738 !i915_request_completed(*execlists->active)) {
2739 struct i915_request *rq = *execlists->active;
2740 const u32 *regs __maybe_unused =
2741 rq->context->lrc_reg_state;
2743 ENGINE_TRACE(engine,
2744 "context completed before request!\n");
2745 ENGINE_TRACE(engine,
2746 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2747 ENGINE_READ(engine, RING_START),
2748 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2749 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2750 ENGINE_READ(engine, RING_CTL),
2751 ENGINE_READ(engine, RING_MI_MODE));
2752 ENGINE_TRACE(engine,
2753 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2754 i915_ggtt_offset(rq->ring->vma),
2755 rq->head, rq->tail,
2756 rq->fence.context,
2757 lower_32_bits(rq->fence.seqno),
2758 hwsp_seqno(rq));
2759 ENGINE_TRACE(engine,
2760 "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2761 regs[CTX_RING_START],
2762 regs[CTX_RING_HEAD],
2763 regs[CTX_RING_TAIL]);
2766 execlists_schedule_out(*execlists->active++);
2768 GEM_BUG_ON(execlists->active - execlists->inflight >
2769 execlists_num_ports(execlists));
2771 } while (head != tail);
2773 set_timeslice(engine);
2776 * Gen11 has proven to fail wrt global observation point between
2777 * entry and tail update, failing on the ordering and thus
2778 * we see an old entry in the context status buffer.
2780 * Forcibly evict out entries for the next gpu csb update,
2781 * to increase the odds that we get fresh entries from non-working
2782 * hardware. The cost of doing so comes out mostly in the wash, as the
2783 * hardware, working or not, will need to do the invalidation
2784 * beforehand.
2786 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
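/*
 * Illustrative sketch, not used by the driver: the head/tail walk performed
 * by process_csb() above, reduced to index arithmetic. The hardware owns the
 * write (tail) pointer, we own the read (head) pointer, both wrap within
 * csb_size entries, and every event between them is consumed exactly once
 * (both indices are assumed to be < num_entries, as in process_csb()).
 */
static inline int
csb_events_pending_sketch(u8 head, u8 tail, u8 num_entries)
{
	int count = 0;

	while (head != tail) {
		if (++head == num_entries)
			head = 0;
		count++;
	}

	return count;
}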
2789 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2791 lockdep_assert_held(&engine->active.lock);
2792 if (!READ_ONCE(engine->execlists.pending[0])) {
2793 rcu_read_lock(); /* protect peeking at execlists->active */
2794 execlists_dequeue(engine);
2795 rcu_read_unlock();
2799 static void __execlists_hold(struct i915_request *rq)
2801 LIST_HEAD(list);
2803 do {
2804 struct i915_dependency *p;
2806 if (i915_request_is_active(rq))
2807 __i915_request_unsubmit(rq);
2809 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2810 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2811 i915_request_set_hold(rq);
2812 RQ_TRACE(rq, "on hold\n");
2814 for_each_waiter(p, rq) {
2815 struct i915_request *w =
2816 container_of(p->waiter, typeof(*w), sched);
2818 /* Leave semaphores spinning on the other engines */
2819 if (w->engine != rq->engine)
2820 continue;
2822 if (!i915_request_is_ready(w))
2823 continue;
2825 if (i915_request_completed(w))
2826 continue;
2828 if (i915_request_on_hold(w))
2829 continue;
2831 list_move_tail(&w->sched.link, &list);
2834 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2835 } while (rq);
2838 static bool execlists_hold(struct intel_engine_cs *engine,
2839 struct i915_request *rq)
2841 if (i915_request_on_hold(rq))
2842 return false;
2844 spin_lock_irq(&engine->active.lock);
2846 if (i915_request_completed(rq)) { /* too late! */
2847 rq = NULL;
2848 goto unlock;
2851 if (rq->engine != engine) { /* preempted virtual engine */
2852 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2855 * intel_context_inflight() is only protected by virtue
2856 * of process_csb() being called only by the tasklet (or
2857 * directly from inside reset while the tasklet is suspended).
2858 * Assert that neither of those are allowed to run while we
2859 * poke at the request queues.
2861 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2864 * An unsubmitted request along a virtual engine will
2865 * remain on the active (this) engine until we are able
2866 * to process the context switch away (and so mark the
2867 * context as no longer in flight). That cannot have happened
2868 * yet, otherwise we would not be hanging!
2870 spin_lock(&ve->base.active.lock);
2871 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2872 GEM_BUG_ON(ve->request != rq);
2873 ve->request = NULL;
2874 spin_unlock(&ve->base.active.lock);
2875 i915_request_put(rq);
2877 rq->engine = engine;
2881 * Transfer this request onto the hold queue to prevent it
2882 * being resubmitted to HW (and potentially completed) before we have
2883 * released it. Since we may have already submitted following
2884 * requests, we need to remove those as well.
2886 GEM_BUG_ON(i915_request_on_hold(rq));
2887 GEM_BUG_ON(rq->engine != engine);
2888 __execlists_hold(rq);
2889 GEM_BUG_ON(list_empty(&engine->active.hold));
2891 unlock:
2892 spin_unlock_irq(&engine->active.lock);
2893 return rq;
2896 static bool hold_request(const struct i915_request *rq)
2898 struct i915_dependency *p;
2899 bool result = false;
2902 * If one of our ancestors is on hold, we must also be on hold,
2903 * otherwise we will bypass it and execute before it.
2905 rcu_read_lock();
2906 for_each_signaler(p, rq) {
2907 const struct i915_request *s =
2908 container_of(p->signaler, typeof(*s), sched);
2910 if (s->engine != rq->engine)
2911 continue;
2913 result = i915_request_on_hold(s);
2914 if (result)
2915 break;
2917 rcu_read_unlock();
2919 return result;
2922 static void __execlists_unhold(struct i915_request *rq)
2924 LIST_HEAD(list);
2926 do {
2927 struct i915_dependency *p;
2929 RQ_TRACE(rq, "hold release\n");
2931 GEM_BUG_ON(!i915_request_on_hold(rq));
2932 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2934 i915_request_clear_hold(rq);
2935 list_move_tail(&rq->sched.link,
2936 i915_sched_lookup_priolist(rq->engine,
2937 rq_prio(rq)));
2938 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2940 /* Also release any children on this engine that are ready */
2941 for_each_waiter(p, rq) {
2942 struct i915_request *w =
2943 container_of(p->waiter, typeof(*w), sched);
2945 /* Propagate any change in error status */
2946 if (rq->fence.error)
2947 i915_request_set_error_once(w, rq->fence.error);
2949 if (w->engine != rq->engine)
2950 continue;
2952 if (!i915_request_on_hold(w))
2953 continue;
2955 /* Check that no other parents are also on hold */
2956 if (hold_request(w))
2957 continue;
2959 list_move_tail(&w->sched.link, &list);
2962 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2963 } while (rq);
2966 static void execlists_unhold(struct intel_engine_cs *engine,
2967 struct i915_request *rq)
2969 spin_lock_irq(&engine->active.lock);
2972 * Move this request back to the priority queue, and all of its
2973 * children and grandchildren that were suspended along with it.
2975 __execlists_unhold(rq);
2977 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2978 engine->execlists.queue_priority_hint = rq_prio(rq);
2979 tasklet_hi_schedule(&engine->execlists.tasklet);
2982 spin_unlock_irq(&engine->active.lock);
2985 struct execlists_capture {
2986 struct work_struct work;
2987 struct i915_request *rq;
2988 struct i915_gpu_coredump *error;
2991 static void execlists_capture_work(struct work_struct *work)
2993 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2994 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2995 struct intel_engine_cs *engine = cap->rq->engine;
2996 struct intel_gt_coredump *gt = cap->error->gt;
2997 struct intel_engine_capture_vma *vma;
2999 /* Compress all the objects attached to the request, slow! */
3000 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
3001 if (vma) {
3002 struct i915_vma_compress *compress =
3003 i915_vma_capture_prepare(gt);
3005 intel_engine_coredump_add_vma(gt->engine, vma, compress);
3006 i915_vma_capture_finish(gt, compress);
3009 gt->simulated = gt->engine->simulated;
3010 cap->error->simulated = gt->simulated;
3012 /* Publish the error state, and announce it to the world */
3013 i915_error_state_store(cap->error);
3014 i915_gpu_coredump_put(cap->error);
3016 /* Return this request and all that depend upon it for signaling */
3017 execlists_unhold(engine, cap->rq);
3018 i915_request_put(cap->rq);
3020 kfree(cap);
3023 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
3025 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
3026 struct execlists_capture *cap;
3028 cap = kmalloc(sizeof(*cap), gfp);
3029 if (!cap)
3030 return NULL;
3032 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
3033 if (!cap->error)
3034 goto err_cap;
3036 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
3037 if (!cap->error->gt)
3038 goto err_gpu;
3040 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
3041 if (!cap->error->gt->engine)
3042 goto err_gt;
3044 cap->error->gt->engine->hung = true;
3046 return cap;
3048 err_gt:
3049 kfree(cap->error->gt);
3050 err_gpu:
3051 kfree(cap->error);
3052 err_cap:
3053 kfree(cap);
3054 return NULL;
3057 static struct i915_request *
3058 active_context(struct intel_engine_cs *engine, u32 ccid)
3060 const struct intel_engine_execlists * const el = &engine->execlists;
3061 struct i915_request * const *port, *rq;
3064 * Use the most recent result from process_csb(), but just in case
3065 * we trigger an error (via interrupt) before the first CS event has
3066 * been written, peek at the next submission.
3069 for (port = el->active; (rq = *port); port++) {
3070 if (rq->context->lrc.ccid == ccid) {
3071 ENGINE_TRACE(engine,
3072 "ccid found at active:%zd\n",
3073 port - el->active);
3074 return rq;
3078 for (port = el->pending; (rq = *port); port++) {
3079 if (rq->context->lrc.ccid == ccid) {
3080 ENGINE_TRACE(engine,
3081 "ccid found at pending:%zd\n",
3082 port - el->pending);
3083 return rq;
3087 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3088 return NULL;
3091 static u32 active_ccid(struct intel_engine_cs *engine)
3093 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3096 static void execlists_capture(struct intel_engine_cs *engine)
3098 struct execlists_capture *cap;
3100 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3101 return;
3104 * We need to _quickly_ capture the engine state before we reset.
3105 * We are inside an atomic section (softirq) here and we are delaying
3106 * the forced preemption event.
3108 cap = capture_regs(engine);
3109 if (!cap)
3110 return;
3112 spin_lock_irq(&engine->active.lock);
3113 cap->rq = active_context(engine, active_ccid(engine));
3114 if (cap->rq) {
3115 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3116 cap->rq = i915_request_get_rcu(cap->rq);
3118 spin_unlock_irq(&engine->active.lock);
3119 if (!cap->rq)
3120 goto err_free;
3123 * Remove the request from the execlists queue, and take ownership
3124 * of the request. We pass it to our worker who will _slowly_ compress
3125 * all the pages the _user_ requested for debugging their batch, after
3126 * which we return it to the queue for signaling.
3128 * By removing them from the execlists queue, we also remove the
3129 * requests from being processed by __unwind_incomplete_requests()
3130 * during the intel_engine_reset(), and so they will *not* be replayed
3131 * afterwards.
3133 * Note that because we have not yet reset the engine at this point,
3134 * it is possible that the request we have identified as being
3135 * guilty did in fact complete and we will then hit an arbitration
3136 * point allowing the outstanding preemption to succeed. The likelihood
3137 * of that is very low (as capturing of the engine registers should be
3138 * fast enough to run inside an irq-off atomic section!), so we will
3139 * simply hold that request accountable for being non-preemptible
3140 * long enough to force the reset.
3142 if (!execlists_hold(engine, cap->rq))
3143 goto err_rq;
3145 INIT_WORK(&cap->work, execlists_capture_work);
3146 schedule_work(&cap->work);
3147 return;
3149 err_rq:
3150 i915_request_put(cap->rq);
3151 err_free:
3152 i915_gpu_coredump_put(cap->error);
3153 kfree(cap);
3156 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3158 const unsigned int bit = I915_RESET_ENGINE + engine->id;
3159 unsigned long *lock = &engine->gt->reset.flags;
3161 if (!intel_has_reset_engine(engine->gt))
3162 return;
3164 if (test_and_set_bit(bit, lock))
3165 return;
3167 ENGINE_TRACE(engine, "reset for %s\n", msg);
3169 /* Mark this tasklet as disabled to avoid waiting for it to complete */
3170 tasklet_disable_nosync(&engine->execlists.tasklet);
3172 ring_set_paused(engine, 1); /* Freeze the current request in place */
3173 execlists_capture(engine);
3174 intel_engine_reset(engine, msg);
3176 tasklet_enable(&engine->execlists.tasklet);
3177 clear_and_wake_up_bit(bit, lock);
3180 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3182 const struct timer_list *t = &engine->execlists.preempt;
3184 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3185 return false;
3187 if (!timer_expired(t))
3188 return false;
3190 return READ_ONCE(engine->execlists.pending[0]);
3194 * Check the unread Context Status Buffers and manage the submission of new
3195 * contexts to the ELSP accordingly.
3197 static void execlists_submission_tasklet(unsigned long data)
3199 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3200 bool timeout = preempt_timeout(engine);
3202 process_csb(engine);
3204 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3205 const char *msg;
3207 /* Generate the error message in priority wrt to the user! */
3208 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3209 msg = "CS error"; /* thrown by a user payload */
3210 else if (engine->execlists.error_interrupt & ERROR_CSB)
3211 msg = "invalid CSB event";
3212 else
3213 msg = "internal error";
3215 engine->execlists.error_interrupt = 0;
3216 execlists_reset(engine, msg);
3219 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3220 unsigned long flags;
3222 spin_lock_irqsave(&engine->active.lock, flags);
3223 __execlists_submission_tasklet(engine);
3224 spin_unlock_irqrestore(&engine->active.lock, flags);
3226 /* Recheck after serialising with direct-submission */
3227 if (unlikely(timeout && preempt_timeout(engine))) {
3228 cancel_timer(&engine->execlists.preempt);
3229 execlists_reset(engine, "preemption time out");
3234 static void __execlists_kick(struct intel_engine_execlists *execlists)
3236 /* Kick the tasklet for some interrupt coalescing and reset handling */
3237 tasklet_hi_schedule(&execlists->tasklet);
3240 #define execlists_kick(t, member) \
3241 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3243 static void execlists_timeslice(struct timer_list *timer)
3245 execlists_kick(timer, timer);
3248 static void execlists_preempt(struct timer_list *timer)
3250 execlists_kick(timer, preempt);
3253 static void queue_request(struct intel_engine_cs *engine,
3254 struct i915_request *rq)
3256 GEM_BUG_ON(!list_empty(&rq->sched.link));
3257 list_add_tail(&rq->sched.link,
3258 i915_sched_lookup_priolist(engine, rq_prio(rq)));
3259 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3262 static void __submit_queue_imm(struct intel_engine_cs *engine)
3264 struct intel_engine_execlists * const execlists = &engine->execlists;
3266 if (reset_in_progress(execlists))
3267 return; /* defer until we restart the engine following reset */
3269 __execlists_submission_tasklet(engine);
3272 static void submit_queue(struct intel_engine_cs *engine,
3273 const struct i915_request *rq)
3275 struct intel_engine_execlists *execlists = &engine->execlists;
3277 if (rq_prio(rq) <= execlists->queue_priority_hint)
3278 return;
3280 execlists->queue_priority_hint = rq_prio(rq);
3281 __submit_queue_imm(engine);
3284 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3285 const struct i915_request *rq)
3287 GEM_BUG_ON(i915_request_on_hold(rq));
3288 return !list_empty(&engine->active.hold) && hold_request(rq);
3291 static void flush_csb(struct intel_engine_cs *engine)
3293 struct intel_engine_execlists *el = &engine->execlists;
3295 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3296 if (!reset_in_progress(el))
3297 process_csb(engine);
3298 tasklet_unlock(&el->tasklet);
3302 static void execlists_submit_request(struct i915_request *request)
3304 struct intel_engine_cs *engine = request->engine;
3305 unsigned long flags;
3307 /* Hopefully we clear execlists->pending[] to let us through */
3308 flush_csb(engine);
3310 /* Will be called from irq-context when using foreign fences. */
3311 spin_lock_irqsave(&engine->active.lock, flags);
3313 if (unlikely(ancestor_on_hold(engine, request))) {
3314 RQ_TRACE(request, "ancestor on hold\n");
3315 list_add_tail(&request->sched.link, &engine->active.hold);
3316 i915_request_set_hold(request);
3317 } else {
3318 queue_request(engine, request);
3320 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3321 GEM_BUG_ON(list_empty(&request->sched.link));
3323 submit_queue(engine, request);
3326 spin_unlock_irqrestore(&engine->active.lock, flags);
3329 static void __execlists_context_fini(struct intel_context *ce)
3331 intel_ring_put(ce->ring);
3332 i915_vma_put(ce->state);
3335 static void execlists_context_destroy(struct kref *kref)
3337 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3339 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3340 GEM_BUG_ON(intel_context_is_pinned(ce));
3342 if (ce->state)
3343 __execlists_context_fini(ce);
3345 intel_context_fini(ce);
3346 intel_context_free(ce);
3349 static void
3350 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3352 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3353 return;
3355 vaddr += engine->context_size;
3357 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3360 static void
3361 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3363 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3364 return;
3366 vaddr += engine->context_size;
3368 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3369 drm_err_once(&engine->i915->drm,
3370 "%s context redzone overwritten!\n",
3371 engine->name);
3374 static void execlists_context_unpin(struct intel_context *ce)
3376 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3377 ce->engine);
3380 static void execlists_context_post_unpin(struct intel_context *ce)
3382 i915_gem_object_unpin_map(ce->state->obj);
3385 static u32 *
3386 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3388 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3389 MI_SRM_LRM_GLOBAL_GTT |
3390 MI_LRI_LRM_CS_MMIO;
3391 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3392 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3393 CTX_TIMESTAMP * sizeof(u32);
3394 *cs++ = 0;
3396 *cs++ = MI_LOAD_REGISTER_REG |
3397 MI_LRR_SOURCE_CS_MMIO |
3398 MI_LRI_LRM_CS_MMIO;
3399 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3400 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3402 *cs++ = MI_LOAD_REGISTER_REG |
3403 MI_LRR_SOURCE_CS_MMIO |
3404 MI_LRI_LRM_CS_MMIO;
3405 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3406 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3408 return cs;
3411 static u32 *
3412 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3414 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3416 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3417 MI_SRM_LRM_GLOBAL_GTT |
3418 MI_LRI_LRM_CS_MMIO;
3419 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3420 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3421 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3422 *cs++ = 0;
3424 return cs;
3427 static u32 *
3428 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3430 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3432 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3433 MI_SRM_LRM_GLOBAL_GTT |
3434 MI_LRI_LRM_CS_MMIO;
3435 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3436 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3437 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3438 *cs++ = 0;
3440 *cs++ = MI_LOAD_REGISTER_REG |
3441 MI_LRR_SOURCE_CS_MMIO |
3442 MI_LRI_LRM_CS_MMIO;
3443 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3444 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3446 return cs;
3449 static u32 *
3450 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3452 cs = gen12_emit_timestamp_wa(ce, cs);
3453 cs = gen12_emit_cmd_buf_wa(ce, cs);
3454 cs = gen12_emit_restore_scratch(ce, cs);
3456 return cs;
3459 static u32 *
3460 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3462 cs = gen12_emit_timestamp_wa(ce, cs);
3463 cs = gen12_emit_restore_scratch(ce, cs);
3465 return cs;
3468 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3470 return PAGE_SIZE * ce->wa_bb_page;
3473 static u32 *context_indirect_bb(const struct intel_context *ce)
3475 void *ptr;
3477 GEM_BUG_ON(!ce->wa_bb_page);
3479 ptr = ce->lrc_reg_state;
3480 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3481 ptr += context_wa_bb_offset(ce);
3483 return ptr;
3486 static void
3487 setup_indirect_ctx_bb(const struct intel_context *ce,
3488 const struct intel_engine_cs *engine,
3489 u32 *(*emit)(const struct intel_context *, u32 *))
3491 u32 * const start = context_indirect_bb(ce);
3492 u32 *cs;
3494 cs = emit(ce, start);
3495 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3496 while ((unsigned long)cs % CACHELINE_BYTES)
3497 *cs++ = MI_NOOP;
3499 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3500 i915_ggtt_offset(ce->state) +
3501 context_wa_bb_offset(ce),
3502 (cs - start) * sizeof(*cs));
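/*
 * Illustrative sketch, not used by the driver: the MI_NOOP padding loop used
 * above (and by the wa_bb emitters further down) simply rounds the command
 * stream up to the next cacheline boundary. Assuming a dword-aligned offset,
 * the number of padding dwords it emits is:
 */
static inline unsigned int
cacheline_pad_dwords_sketch(unsigned long offset_in_bytes)
{
	unsigned long rem = offset_in_bytes % CACHELINE_BYTES;

	return rem ? (CACHELINE_BYTES - rem) / sizeof(u32) : 0;
}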
3505 static void
3506 __execlists_update_reg_state(const struct intel_context *ce,
3507 const struct intel_engine_cs *engine,
3508 u32 head)
3510 struct intel_ring *ring = ce->ring;
3511 u32 *regs = ce->lrc_reg_state;
3513 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3514 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3516 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3517 regs[CTX_RING_HEAD] = head;
3518 regs[CTX_RING_TAIL] = ring->tail;
3519 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3521 /* RPCS */
3522 if (engine->class == RENDER_CLASS) {
3523 regs[CTX_R_PWR_CLK_STATE] =
3524 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3526 i915_oa_init_reg_state(ce, engine);
3529 if (ce->wa_bb_page) {
3530 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3532 fn = gen12_emit_indirect_ctx_xcs;
3533 if (ce->engine->class == RENDER_CLASS)
3534 fn = gen12_emit_indirect_ctx_rcs;
3536 /* Mutually exclusive wrt the global indirect bb */
3537 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3538 setup_indirect_ctx_bb(ce, engine, fn);
3542 static int
3543 execlists_context_pre_pin(struct intel_context *ce,
3544 struct i915_gem_ww_ctx *ww, void **vaddr)
3546 GEM_BUG_ON(!ce->state);
3547 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3549 *vaddr = i915_gem_object_pin_map(ce->state->obj,
3550 i915_coherent_map_type(ce->engine->i915) |
3551 I915_MAP_OVERRIDE);
3553 return PTR_ERR_OR_ZERO(*vaddr);
3556 static int
3557 __execlists_context_pin(struct intel_context *ce,
3558 struct intel_engine_cs *engine,
3559 void *vaddr)
3561 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3562 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3563 __execlists_update_reg_state(ce, engine, ce->ring->tail);
3565 return 0;
3568 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3570 return __execlists_context_pin(ce, ce->engine, vaddr);
3573 static int execlists_context_alloc(struct intel_context *ce)
3575 return __execlists_context_alloc(ce, ce->engine);
3578 static void execlists_context_reset(struct intel_context *ce)
3580 CE_TRACE(ce, "reset\n");
3581 GEM_BUG_ON(!intel_context_is_pinned(ce));
3583 intel_ring_reset(ce->ring, ce->ring->emit);
3585 /* Scrub away the garbage */
3586 execlists_init_reg_state(ce->lrc_reg_state,
3587 ce, ce->engine, ce->ring, true);
3588 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3590 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3593 static const struct intel_context_ops execlists_context_ops = {
3594 .alloc = execlists_context_alloc,
3596 .pre_pin = execlists_context_pre_pin,
3597 .pin = execlists_context_pin,
3598 .unpin = execlists_context_unpin,
3599 .post_unpin = execlists_context_post_unpin,
3601 .enter = intel_context_enter_engine,
3602 .exit = intel_context_exit_engine,
3604 .reset = execlists_context_reset,
3605 .destroy = execlists_context_destroy,
3608 static u32 hwsp_offset(const struct i915_request *rq)
3610 const struct intel_timeline_cacheline *cl;
3612 /* Before the request is executed, the timeline/cacheline is fixed */
3614 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3615 if (cl)
3616 return cl->ggtt_offset;
3618 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3621 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3623 u32 *cs;
3625 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3626 if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3627 return 0;
3629 cs = intel_ring_begin(rq, 6);
3630 if (IS_ERR(cs))
3631 return PTR_ERR(cs);
3634 * Check if we have been preempted before we even get started.
3636 * After this point i915_request_started() reports true, even if
3637 * we get preempted and so are no longer running.
3639 *cs++ = MI_ARB_CHECK;
3640 *cs++ = MI_NOOP;
3642 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3643 *cs++ = hwsp_offset(rq);
3644 *cs++ = 0;
3645 *cs++ = rq->fence.seqno - 1;
3647 intel_ring_advance(rq, cs);
3649 /* Record the updated position of the request's payload */
3650 rq->infix = intel_ring_offset(rq, cs);
3652 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3654 return 0;
3657 static int emit_pdps(struct i915_request *rq)
3659 const struct intel_engine_cs * const engine = rq->engine;
3660 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3661 int err, i;
3662 u32 *cs;
3664 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3667 * Beware ye of the dragons, this sequence is magic!
3669 * Small changes to this sequence can cause anything from
3670 * GPU hangs to forcewake errors and machine lockups!
3673 /* Flush any residual operations from the context load */
3674 err = engine->emit_flush(rq, EMIT_FLUSH);
3675 if (err)
3676 return err;
3678 /* Magic required to prevent forcewake errors! */
3679 err = engine->emit_flush(rq, EMIT_INVALIDATE);
3680 if (err)
3681 return err;
3683 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3684 if (IS_ERR(cs))
3685 return PTR_ERR(cs);
3687 /* Ensure the LRI have landed before we invalidate & continue */
3688 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3689 for (i = GEN8_3LVL_PDPES; i--; ) {
3690 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3691 u32 base = engine->mmio_base;
3693 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3694 *cs++ = upper_32_bits(pd_daddr);
3695 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3696 *cs++ = lower_32_bits(pd_daddr);
3698 *cs++ = MI_NOOP;
3700 intel_ring_advance(rq, cs);
3702 return 0;
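/*
 * Illustrative sketch, not used by the driver: the ring space emit_pdps()
 * above reserves. Each page-directory pointer needs two register/value pairs
 * (upper and lower dword), all under a single MI_LOAD_REGISTER_IMM header,
 * plus a trailing MI_NOOP -- which is exactly the 4 * GEN8_3LVL_PDPES + 2
 * dwords requested from intel_ring_begin().
 */
static inline unsigned int
pdps_lri_dwords_sketch(unsigned int num_pdpes)
{
	return 1 + 4 * num_pdpes + 1; /* LRI header + 2 pairs per PDP + NOOP */
}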
3705 static int execlists_request_alloc(struct i915_request *request)
3707 int ret;
3709 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3712 * Flush enough space to reduce the likelihood of waiting after
3713 * we start building the request - in which case we will just
3714 * have to repeat work.
3716 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3719 * Note that after this point, we have committed to using
3720 * this request as it is being used to both track the
3721 * state of engine initialisation and liveness of the
3722 * golden renderstate above. Think twice before you try
3723 * to cancel/unwind this request now.
3726 if (!i915_vm_is_4lvl(request->context->vm)) {
3727 ret = emit_pdps(request);
3728 if (ret)
3729 return ret;
3732 /* Unconditionally invalidate GPU caches and TLBs. */
3733 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3734 if (ret)
3735 return ret;
3737 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3738 return 0;
3742 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3743 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3744 * but there is a slight complication as this is applied in a WA batch where the
3745 * values are only initialized once, so we cannot take the register value at the
3746 * beginning and reuse it further; hence we save its value to memory, upload a
3747 * constant value with bit21 set and then we restore it back with the saved value.
3748 * To simplify the WA, a constant value is formed by using the default value
3749 * of this register. This shouldn't be a problem because we are only modifying
3750 * it for a short period and this batch is non-preemptible. We can of course
3751 * use additional instructions that read the actual value of the register
3752 * at that time and set our bit of interest but it makes the WA complicated.
3754 * This WA is also required for Gen9 so extracting as a function avoids
3755 * code duplication.
3757 static u32 *
3758 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3760 /* NB no one else is allowed to scribble over scratch + 256! */
3761 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3762 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3763 *batch++ = intel_gt_scratch_offset(engine->gt,
3764 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3765 *batch++ = 0;
3767 *batch++ = MI_LOAD_REGISTER_IMM(1);
3768 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3769 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3771 batch = gen8_emit_pipe_control(batch,
3772 PIPE_CONTROL_CS_STALL |
2773 PIPE_CONTROL_DC_FLUSH_ENABLE,
2774 0);
3776 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3777 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3778 *batch++ = intel_gt_scratch_offset(engine->gt,
3779 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3780 *batch++ = 0;
3782 return batch;
3786 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3787 * initialized at the beginning and shared across all contexts but this field
3788 * helps us to have multiple batches at different offsets and select them based
3789 * on a criterion. At the moment this batch always starts at the beginning of the page
3790 * and at this point we don't have multiple wa_ctx batch buffers.
3792 * The number of WAs applied is not known at the beginning; we use this field
3793 * to return the number of DWORDS written.
3795 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3796 * so it adds NOOPs as padding to make it cacheline aligned.
3797 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3798 * makes a complete batch buffer.
3800 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3802 /* WaDisableCtxRestoreArbitration:bdw,chv */
3803 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3805 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3806 if (IS_BROADWELL(engine->i915))
3807 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3809 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3810 /* Actual scratch location is at 128 bytes offset */
3811 batch = gen8_emit_pipe_control(batch,
3812 PIPE_CONTROL_FLUSH_L3 |
3813 PIPE_CONTROL_STORE_DATA_INDEX |
3814 PIPE_CONTROL_CS_STALL |
3815 PIPE_CONTROL_QW_WRITE,
3816 LRC_PPHWSP_SCRATCH_ADDR);
3818 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3820 /* Pad to end of cacheline */
3821 while ((unsigned long)batch % CACHELINE_BYTES)
3822 *batch++ = MI_NOOP;
3825 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3826 * execution depends on the length specified in terms of cache lines
3827 * in the register CTX_RCS_INDIRECT_CTX
3830 return batch;
3833 struct lri {
3834 i915_reg_t reg;
3835 u32 value;
3838 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3840 GEM_BUG_ON(!count || count > 63);
3842 *batch++ = MI_LOAD_REGISTER_IMM(count);
3843 do {
3844 *batch++ = i915_mmio_reg_offset(lri->reg);
3845 *batch++ = lri->value;
3846 } while (lri++, --count);
3847 *batch++ = MI_NOOP;
3849 return batch;
3852 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3854 static const struct lri lri[] = {
3855 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3857 COMMON_SLICE_CHICKEN2,
3858 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3862 /* BSpec: 11391 */
3864 FF_SLICE_CHICKEN,
3865 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3866 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3869 /* BSpec: 11299 */
3871 _3D_CHICKEN3,
3872 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3873 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3877 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3879 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3880 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3882 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3883 batch = gen8_emit_pipe_control(batch,
3884 PIPE_CONTROL_FLUSH_L3 |
3885 PIPE_CONTROL_STORE_DATA_INDEX |
3886 PIPE_CONTROL_CS_STALL |
3887 PIPE_CONTROL_QW_WRITE,
3888 LRC_PPHWSP_SCRATCH_ADDR);
3890 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3892 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3893 if (HAS_POOLED_EU(engine->i915)) {
3895 * EU pool configuration is setup along with golden context
3896 * during context initialization. This value depends on
3897 * device type (2x6 or 3x6) and needs to be updated based
3898 * on which subslice is disabled especially for 2x6
3899 * devices; however, it is safe to load the default
3900 * configuration of a 3x6 device instead of masking off
3901 * the corresponding bits because the HW ignores bits of a disabled
3902 * subslice and drops down to the appropriate config. Please
3903 * see render_state_setup() in i915_gem_render_state.c for
3904 * possible configurations, to avoid duplication they are
3905 * not shown here again.
3907 *batch++ = GEN9_MEDIA_POOL_STATE;
3908 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3909 *batch++ = 0x00777000;
3910 *batch++ = 0;
3911 *batch++ = 0;
3912 *batch++ = 0;
3915 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3917 /* Pad to end of cacheline */
3918 while ((unsigned long)batch % CACHELINE_BYTES)
3919 *batch++ = MI_NOOP;
3921 return batch;
3924 static u32 *
3925 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3927 int i;
3930 * WaPipeControlBefore3DStateSamplePattern: cnl
3932 * Ensure the engine is idle prior to programming a
3933 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3935 batch = gen8_emit_pipe_control(batch,
3936 PIPE_CONTROL_CS_STALL,
3937 0);
3939 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3940 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3941 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3942 * confusing. Since gen8_emit_pipe_control() already advances the
3943 * batch by 6 dwords, we advance the other 10 here, completing a
3944 * cacheline. It's not clear if the workaround requires this padding
3945 * before other commands, or if it's just the regular padding we would
3946 * already have for the workaround bb, so leave it here for now.
3948 for (i = 0; i < 10; i++)
3949 *batch++ = MI_NOOP;
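/*
 * For reference: the 6 dwords emitted by gen8_emit_pipe_control() plus the
 * 10 NOOPs above give the 16 dwords (64 bytes) the comment calls for, i.e.
 * exactly one cacheline, assuming CACHELINE_BYTES is 64 here.
 */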
3951 /* Pad to end of cacheline */
3952 while ((unsigned long)batch % CACHELINE_BYTES)
3953 *batch++ = MI_NOOP;
3955 return batch;
3958 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3960 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3962 struct drm_i915_gem_object *obj;
3963 struct i915_vma *vma;
3964 int err;
3966 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3967 if (IS_ERR(obj))
3968 return PTR_ERR(obj);
3970 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3971 if (IS_ERR(vma)) {
3972 err = PTR_ERR(vma);
3973 goto err;
3976 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3977 if (err)
3978 goto err;
3980 engine->wa_ctx.vma = vma;
3981 return 0;
3983 err:
3984 i915_gem_object_put(obj);
3985 return err;
3988 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3990 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3993 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3995 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3997 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3998 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3999 &wa_ctx->per_ctx };
4000 wa_bb_func_t wa_bb_fn[2];
4001 void *batch, *batch_ptr;
4002 unsigned int i;
4003 int ret;
4005 if (engine->class != RENDER_CLASS)
4006 return 0;
4008 switch (INTEL_GEN(engine->i915)) {
4009 case 12:
4010 case 11:
4011 return 0;
4012 case 10:
4013 wa_bb_fn[0] = gen10_init_indirectctx_bb;
4014 wa_bb_fn[1] = NULL;
4015 break;
4016 case 9:
4017 wa_bb_fn[0] = gen9_init_indirectctx_bb;
4018 wa_bb_fn[1] = NULL;
4019 break;
4020 case 8:
4021 wa_bb_fn[0] = gen8_init_indirectctx_bb;
4022 wa_bb_fn[1] = NULL;
4023 break;
4024 default:
4025 MISSING_CASE(INTEL_GEN(engine->i915));
4026 return 0;
4029 ret = lrc_setup_wa_ctx(engine);
4030 if (ret) {
4031 drm_dbg(&engine->i915->drm,
4032 "Failed to setup context WA page: %d\n", ret);
4033 return ret;
4036 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
4039 * Emit the two workaround batch buffers, recording the offset from the
4040 * start of the workaround batch buffer object for each and their
4041 * respective sizes.
4043 batch_ptr = batch;
4044 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
4045 wa_bb[i]->offset = batch_ptr - batch;
4046 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
4047 CACHELINE_BYTES))) {
4048 ret = -EINVAL;
4049 break;
4051 if (wa_bb_fn[i])
4052 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4053 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4055 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4057 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4058 __i915_gem_object_release_map(wa_ctx->vma->obj);
4059 if (ret)
4060 lrc_destroy_wa_ctx(engine);
4062 return ret;
4065 static void reset_csb_pointers(struct intel_engine_cs *engine)
4067 struct intel_engine_execlists * const execlists = &engine->execlists;
4068 const unsigned int reset_value = execlists->csb_size - 1;
4070 ring_set_paused(engine, 0);
4073 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4074 * Bludgeon them with a mmio update to be sure.
4076 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4077 0xffff << 16 | reset_value << 8 | reset_value);
4078 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
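/*
 * The write above presumably follows the masked-register convention: the
 * 0xffff in the upper half unmasks both pointer fields, while
 * reset_value << 8 and reset_value reload what appear to be the CSB write
 * and read pointers respectively, parking both on the last entry.
 */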
4081 * After a reset, the HW starts writing into CSB entry [0]. We
4082 * therefore have to set our HEAD pointer back one entry so that
4083 * the *first* entry we check is entry 0. To complicate this further,
4084 * as we don't wait for the first interrupt after reset, we have to
4085 * fake the HW write to point back to the last entry so that our
4086 * inline comparison of our cached head position against the last HW
4087 * write works even before the first interrupt.
4089 execlists->csb_head = reset_value;
4090 WRITE_ONCE(*execlists->csb_write, reset_value);
4091 wmb(); /* Make sure this is visible to HW (paranoia?) */
4093 /* Check that the GPU does indeed update the CSB entries! */
4094 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4095 invalidate_csb_entries(&execlists->csb_status[0],
4096 &execlists->csb_status[reset_value]);
4098 /* Once more for luck and our trusty paranoia */
4099 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4100 0xffff << 16 | reset_value << 8 | reset_value);
4101 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4103 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4106 static void execlists_sanitize(struct intel_engine_cs *engine)
4108 GEM_BUG_ON(execlists_active(&engine->execlists));
4111 * Poison residual state on resume, in case the suspend didn't!
4113 * We have to assume that across suspend/resume (or other loss
4114 * of control) the contents of our pinned buffers have been
4115 * lost, replaced by garbage. Since this doesn't always happen,
4116 * let's poison such state so that we more quickly spot when
4117 * we falsely assume it has been preserved.
4119 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4120 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4122 reset_csb_pointers(engine);
4125 * The kernel_context HWSP is stored in the status_page. As above,
4126 * that may be lost on resume/initialisation, and so we need to
4127 * reset the value in the HWSP.
4129 intel_timeline_reset_seqno(engine->kernel_context->timeline);
4131 /* And scrub the dirty cachelines for the HWSP */
4132 clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4135 static void enable_error_interrupt(struct intel_engine_cs *engine)
4137 u32 status;
4139 engine->execlists.error_interrupt = 0;
4140 ENGINE_WRITE(engine, RING_EMR, ~0u);
4141 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4143 status = ENGINE_READ(engine, RING_ESR);
4144 if (unlikely(status)) {
4145 drm_err(&engine->i915->drm,
4146 "engine '%s' resumed still in error: %08x\n",
4147 engine->name, status);
4148 __intel_gt_reset(engine->gt, engine->mask);
4152 * On current gen8+, we have 2 signals to play with
4154 * - I915_ERROR_INSTRUCTION (bit 0)
4156 * Generate an error if the command parser encounters an invalid
4157 * instruction
4159 * This is a fatal error.
4161 * - CP_PRIV (bit 2)
4163 * Generate an error on privilege violation (where the CP replaces
4164 * the instruction with a no-op). This also fires for writes into
4165 * read-only scratch pages.
4167 * This is a non-fatal error, parsing continues.
4169 * * there are a few others defined for odd HW that we do not use
4171 * Since CP_PRIV fires for cases where we have chosen to ignore the
4172 * error (as the HW is validating and suppressing the mistakes), we
4173 * only unmask the instruction error bit.
4175 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
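/*
 * i.e. RING_EMR ends up with every bit set except bit 0: only the (fatal)
 * instruction error is unmasked, while CP_PRIV and the other sources
 * described above stay masked.
 */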
4178 static void enable_execlists(struct intel_engine_cs *engine)
4180 u32 mode;
4182 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4184 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4186 if (INTEL_GEN(engine->i915) >= 11)
4187 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4188 else
4189 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4190 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4192 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4194 ENGINE_WRITE_FW(engine,
4195 RING_HWS_PGA,
4196 i915_ggtt_offset(engine->status_page.vma));
4197 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4199 enable_error_interrupt(engine);
4201 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4204 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4206 bool unexpected = false;
4208 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4209 drm_dbg(&engine->i915->drm,
4210 "STOP_RING still set in RING_MI_MODE\n");
4211 unexpected = true;
4214 return unexpected;
4217 static int execlists_resume(struct intel_engine_cs *engine)
4219 intel_mocs_init_engine(engine);
4221 intel_breadcrumbs_reset(engine->breadcrumbs);
4223 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4224 struct drm_printer p = drm_debug_printer(__func__);
4226 intel_engine_dump(engine, &p, NULL);
4229 enable_execlists(engine);
4231 return 0;
4234 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4236 struct intel_engine_execlists * const execlists = &engine->execlists;
4237 unsigned long flags;
4239 ENGINE_TRACE(engine, "depth<-%d\n",
4240 atomic_read(&execlists->tasklet.count));
4243 * Prevent request submission to the hardware until we have
4244 * completed the reset in i915_gem_reset_finish(). If a request
4245 * is completed by one engine, it may then queue a request
4246 * to a second via its execlists->tasklet *just* as we are
4247 * calling engine->resume() and also writing the ELSP.
4248 * Turning off the execlists->tasklet until the reset is over
4249 * prevents the race.
4251 __tasklet_disable_sync_once(&execlists->tasklet);
4252 GEM_BUG_ON(!reset_in_progress(execlists));
4254 /* And flush any current direct submission. */
4255 spin_lock_irqsave(&engine->active.lock, flags);
4256 spin_unlock_irqrestore(&engine->active.lock, flags);
4259 * We stop engines, otherwise we might get a failed reset and a
4260 * dead gpu (on elk). Even a modern gpu such as kbl can suffer
4261 * from a system hang if a batchbuffer is progressing when
4262 * the reset is issued, regardless of the READY_TO_RESET ack.
4263 * Thus assume it is best to stop engines on all gens
4264 * where we have a gpu reset.
4266 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4268 * FIXME: Wa for more modern gens needs to be validated
4270 ring_set_paused(engine, 1);
4271 intel_engine_stop_cs(engine);
4273 engine->execlists.reset_ccid = active_ccid(engine);
4276 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4278 int x;
4280 x = lrc_ring_mi_mode(engine);
4281 if (x != -1) {
4282 regs[x + 1] &= ~STOP_RING;
4283 regs[x + 1] |= STOP_RING << 16;
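/*
 * Masked-bit idiom: the high half (STOP_RING << 16) arms the write for that
 * bit while the low half leaves it cleared, equivalent to OR-ing in
 * _MASKED_BIT_DISABLE(STOP_RING) so that the context restore clears
 * STOP_RING in MI_MODE.
 */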
4287 static void __execlists_reset_reg_state(const struct intel_context *ce,
4288 const struct intel_engine_cs *engine)
4290 u32 *regs = ce->lrc_reg_state;
4292 __reset_stop_ring(regs, engine);
4295 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4297 struct intel_engine_execlists * const execlists = &engine->execlists;
4298 struct intel_context *ce;
4299 struct i915_request *rq;
4300 u32 head;
4302 mb(); /* paranoia: read the CSB pointers from after the reset */
4303 clflush(execlists->csb_write);
4304 mb();
4306 process_csb(engine); /* drain preemption events */
4308 /* Following the reset, we need to reload the CSB read/write pointers */
4309 reset_csb_pointers(engine);
4312 * Save the currently executing context, even if we completed
4313 * its request, it was still running at the time of the
4314 * reset and will have been clobbered.
4316 rq = active_context(engine, engine->execlists.reset_ccid);
4317 if (!rq)
4318 goto unwind;
4320 ce = rq->context;
4321 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4323 if (i915_request_completed(rq)) {
4324 /* Idle context; tidy up the ring so we can restart afresh */
4325 head = intel_ring_wrap(ce->ring, rq->tail);
4326 goto out_replay;
4329 /* We still have requests in-flight; the engine should be active */
4330 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4332 /* Context has requests still in-flight; it should not be idle! */
4333 GEM_BUG_ON(i915_active_is_idle(&ce->active));
4335 rq = active_request(ce->timeline, rq);
4336 head = intel_ring_wrap(ce->ring, rq->head);
4337 GEM_BUG_ON(head == ce->ring->tail);
4340 * If this request hasn't started yet, e.g. it is waiting on a
4341 * semaphore, we need to avoid skipping the request or else we
4342 * break the signaling chain. However, if the context is corrupt
4343 * the request will not restart and we will be stuck with a wedged
4344 * device. It is quite often the case that if we issue a reset
4345 * while the GPU is loading the context image, the context
4346 * image becomes corrupt.
4348 * Otherwise, if we have not started yet, the request should replay
4349 * perfectly and we do not need to flag the result as being erroneous.
4351 if (!i915_request_started(rq))
4352 goto out_replay;
4355 * If the request was innocent, we leave the request in the ELSP
4356 * and will try to replay it on restarting. The context image may
4357 * have been corrupted by the reset, in which case we may have
4358 * to service a new GPU hang, but more likely we can continue on
4359 * without impact.
4361 * If the request was guilty, we presume the context is corrupt
4362 * and have to at least restore the RING register in the context
4363 * image back to the expected values to skip over the guilty request.
4365 __i915_request_reset(rq, stalled);
4368 * We want a simple context + ring to execute the breadcrumb update.
4369 * We cannot rely on the context being intact across the GPU hang,
4370 * so clear it and rebuild just what we need for the breadcrumb.
4371 * All pending requests for this context will be zapped, and any
4372 * future request will be after userspace has had the opportunity
4373 * to recreate its own state.
4375 out_replay:
4376 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4377 head, ce->ring->tail);
4378 __execlists_reset_reg_state(ce, engine);
4379 __execlists_update_reg_state(ce, engine, head);
4380 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4382 unwind:
4383 /* Push back any incomplete requests for replay after the reset. */
4384 cancel_port_requests(execlists);
4385 __unwind_incomplete_requests(engine);
4388 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4390 unsigned long flags;
4392 ENGINE_TRACE(engine, "\n");
4394 spin_lock_irqsave(&engine->active.lock, flags);
4396 __execlists_reset(engine, stalled);
4398 spin_unlock_irqrestore(&engine->active.lock, flags);
4401 static void nop_submission_tasklet(unsigned long data)
4403 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4405 /* The driver is wedged; don't process any more events. */
4406 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4409 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4411 struct intel_engine_execlists * const execlists = &engine->execlists;
4412 struct i915_request *rq, *rn;
4413 struct rb_node *rb;
4414 unsigned long flags;
4416 ENGINE_TRACE(engine, "\n");
4419 * Before we call engine->cancel_requests(), we should have exclusive
4420 * access to the submission state. This is arranged for us by the
4421 * caller disabling the interrupt generation, the tasklet and other
4422 * threads that may then access the same state, giving us a free hand
4423 * to reset state. However, we still need to let lockdep be aware that
4424 * we know this state may be accessed in hardirq context, so we
4425 * disable the irq around this manipulation and we want to keep
4426 * the spinlock focused on its duties and not accidentally conflate
4427 * coverage to the submission's irq state. (Similarly, although we
4428 * shouldn't need to disable irq around the manipulation of the
4429 * submission's irq state, we also wish to remind ourselves that
4430 * it is irq state.)
4432 spin_lock_irqsave(&engine->active.lock, flags);
4434 __execlists_reset(engine, true);
4436 /* Mark all executing requests as skipped. */
4437 list_for_each_entry(rq, &engine->active.requests, sched.link)
4438 mark_eio(rq);
4439 intel_engine_signal_breadcrumbs(engine);
4441 /* Flush the queued requests to the timeline list (for retiring). */
4442 while ((rb = rb_first_cached(&execlists->queue))) {
4443 struct i915_priolist *p = to_priolist(rb);
4444 int i;
4446 priolist_for_each_request_consume(rq, rn, p, i) {
4447 mark_eio(rq);
4448 __i915_request_submit(rq);
4451 rb_erase_cached(&p->node, &execlists->queue);
4452 i915_priolist_free(p);
4455 /* On-hold requests will be flushed to timeline upon their release */
4456 list_for_each_entry(rq, &engine->active.hold, sched.link)
4457 mark_eio(rq);
4459 /* Cancel all attached virtual engines */
4460 while ((rb = rb_first_cached(&execlists->virtual))) {
4461 struct virtual_engine *ve =
4462 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4464 rb_erase_cached(rb, &execlists->virtual);
4465 RB_CLEAR_NODE(rb);
4467 spin_lock(&ve->base.active.lock);
4468 rq = fetch_and_zero(&ve->request);
4469 if (rq) {
4470 mark_eio(rq);
4472 rq->engine = engine;
4473 __i915_request_submit(rq);
4474 i915_request_put(rq);
4476 ve->base.execlists.queue_priority_hint = INT_MIN;
4478 spin_unlock(&ve->base.active.lock);
4481 /* Remaining _unready_ requests will be nop'ed when submitted */
4483 execlists->queue_priority_hint = INT_MIN;
4484 execlists->queue = RB_ROOT_CACHED;
4486 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4487 execlists->tasklet.func = nop_submission_tasklet;
4489 spin_unlock_irqrestore(&engine->active.lock, flags);
4492 static void execlists_reset_finish(struct intel_engine_cs *engine)
4494 struct intel_engine_execlists * const execlists = &engine->execlists;
4497 * After a GPU reset, we may have requests to replay. Do so now while
4498 * we still have the forcewake to be sure that the GPU is not allowed
4499 * to sleep before we restart and reload a context.
4501 GEM_BUG_ON(!reset_in_progress(execlists));
4502 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4503 execlists->tasklet.func(execlists->tasklet.data);
4505 if (__tasklet_enable(&execlists->tasklet))
4506 /* And kick in case we missed a new request submission. */
4507 tasklet_hi_schedule(&execlists->tasklet);
4508 ENGINE_TRACE(engine, "depth->%d\n",
4509 atomic_read(&execlists->tasklet.count));
4512 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4513 u64 offset, u32 len,
4514 const unsigned int flags)
4516 u32 *cs;
4518 cs = intel_ring_begin(rq, 4);
4519 if (IS_ERR(cs))
4520 return PTR_ERR(cs);
4523 * WaDisableCtxRestoreArbitration:bdw,chv
4525 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4526 * particular all the gens that do not need the w/a at all!), if we
4527 * took care to make sure that on every switch into this context
4528 * (both ordinary and for preemption) arbitration was enabled
4529 * we would be fine. However, for gen8 there is another w/a that
4530 * requires us to not preempt inside GPGPU execution, so we keep
4531 * arbitration disabled for gen8 batches. Arbitration will be
4532 * re-enabled before we close the request
4533 * (engine->emit_fini_breadcrumb).
4535 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4537 /* FIXME(BDW+): Address space and security selectors. */
4538 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4539 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4540 *cs++ = lower_32_bits(offset);
4541 *cs++ = upper_32_bits(offset);
4543 intel_ring_advance(rq, cs);
4545 return 0;
4548 static int gen8_emit_bb_start(struct i915_request *rq,
4549 u64 offset, u32 len,
4550 const unsigned int flags)
4552 u32 *cs;
4554 cs = intel_ring_begin(rq, 6);
4555 if (IS_ERR(cs))
4556 return PTR_ERR(cs);
4558 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4560 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4561 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4562 *cs++ = lower_32_bits(offset);
4563 *cs++ = upper_32_bits(offset);
4565 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4566 *cs++ = MI_NOOP;
4568 intel_ring_advance(rq, cs);
4570 return 0;
4573 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4575 ENGINE_WRITE(engine, RING_IMR,
4576 ~(engine->irq_enable_mask | engine->irq_keep_mask));
4577 ENGINE_POSTING_READ(engine, RING_IMR);
4580 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4582 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4585 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4587 u32 cmd, *cs;
4589 cs = intel_ring_begin(request, 4);
4590 if (IS_ERR(cs))
4591 return PTR_ERR(cs);
4593 cmd = MI_FLUSH_DW + 1;
4595 /* We always require a command barrier so that subsequent
4596 * commands, such as breadcrumb interrupts, are strictly ordered
4597 * wrt the contents of the write cache being flushed to memory
4598 * (and thus being coherent from the CPU).
4600 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4602 if (mode & EMIT_INVALIDATE) {
4603 cmd |= MI_INVALIDATE_TLB;
4604 if (request->engine->class == VIDEO_DECODE_CLASS)
4605 cmd |= MI_INVALIDATE_BSD;
4608 *cs++ = cmd;
4609 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4610 *cs++ = 0; /* upper addr */
4611 *cs++ = 0; /* value */
4612 intel_ring_advance(request, cs);
4614 return 0;
4617 static int gen8_emit_flush_render(struct i915_request *request,
4618 u32 mode)
4620 bool vf_flush_wa = false, dc_flush_wa = false;
4621 u32 *cs, flags = 0;
4622 int len;
4624 flags |= PIPE_CONTROL_CS_STALL;
4626 if (mode & EMIT_FLUSH) {
4627 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4628 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4629 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4630 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4633 if (mode & EMIT_INVALIDATE) {
4634 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4635 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4636 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4637 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4638 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4639 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4640 flags |= PIPE_CONTROL_QW_WRITE;
4641 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4644 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4645 * pipe control.
4647 if (IS_GEN(request->engine->i915, 9))
4648 vf_flush_wa = true;
4650 /* WaForGAMHang:kbl */
4651 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4652 dc_flush_wa = true;
4655 len = 6;
4657 if (vf_flush_wa)
4658 len += 6;
4660 if (dc_flush_wa)
4661 len += 12;
4663 cs = intel_ring_begin(request, len);
4664 if (IS_ERR(cs))
4665 return PTR_ERR(cs);
4667 if (vf_flush_wa)
4668 cs = gen8_emit_pipe_control(cs, 0, 0);
4670 if (dc_flush_wa)
4671 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4672 0);
4674 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4676 if (dc_flush_wa)
4677 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4679 intel_ring_advance(request, cs);
4681 return 0;
4684 static int gen11_emit_flush_render(struct i915_request *request,
4685 u32 mode)
4687 if (mode & EMIT_FLUSH) {
4688 u32 *cs;
4689 u32 flags = 0;
4691 flags |= PIPE_CONTROL_CS_STALL;
4693 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4694 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4695 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4696 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4697 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4698 flags |= PIPE_CONTROL_QW_WRITE;
4699 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4701 cs = intel_ring_begin(request, 6);
4702 if (IS_ERR(cs))
4703 return PTR_ERR(cs);
4705 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4706 intel_ring_advance(request, cs);
4709 if (mode & EMIT_INVALIDATE) {
4710 u32 *cs;
4711 u32 flags = 0;
4713 flags |= PIPE_CONTROL_CS_STALL;
4715 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4716 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4717 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4718 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4719 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4720 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4721 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4722 flags |= PIPE_CONTROL_QW_WRITE;
4723 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4725 cs = intel_ring_begin(request, 6);
4726 if (IS_ERR(cs))
4727 return PTR_ERR(cs);
4729 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4730 intel_ring_advance(request, cs);
4733 return 0;
4736 static u32 preparser_disable(bool state)
4738 return MI_ARB_CHECK | 1 << 8 | state;
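/*
 * Hedged reading of the encoding, based on the gen12 callers below: bit 8
 * selects the pre-fetch-disable form of MI_ARB_CHECK and bit 0 carries the
 * disable state, so preparser_disable(true) switches the pre-parser off and
 * preparser_disable(false) re-enables it around the TLB invalidation.
 */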
4741 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4743 static const i915_reg_t vd[] = {
4744 GEN12_VD0_AUX_NV,
4745 GEN12_VD1_AUX_NV,
4746 GEN12_VD2_AUX_NV,
4747 GEN12_VD3_AUX_NV,
4750 static const i915_reg_t ve[] = {
4751 GEN12_VE0_AUX_NV,
4752 GEN12_VE1_AUX_NV,
4755 if (engine->class == VIDEO_DECODE_CLASS)
4756 return vd[engine->instance];
4758 if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4759 return ve[engine->instance];
4761 GEM_BUG_ON("unknown aux_inv_reg\n");
4763 return INVALID_MMIO_REG;
4766 static u32 *
4767 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4769 *cs++ = MI_LOAD_REGISTER_IMM(1);
4770 *cs++ = i915_mmio_reg_offset(inv_reg);
4771 *cs++ = AUX_INV;
4772 *cs++ = MI_NOOP;
4774 return cs;
4777 static int gen12_emit_flush_render(struct i915_request *request,
4778 u32 mode)
4780 if (mode & EMIT_FLUSH) {
4781 u32 flags = 0;
4782 u32 *cs;
4784 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4785 flags |= PIPE_CONTROL_FLUSH_L3;
4786 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4787 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4788 /* Wa_1409600907:tgl */
4789 flags |= PIPE_CONTROL_DEPTH_STALL;
4790 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4791 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4793 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4794 flags |= PIPE_CONTROL_QW_WRITE;
4796 flags |= PIPE_CONTROL_CS_STALL;
4798 cs = intel_ring_begin(request, 6);
4799 if (IS_ERR(cs))
4800 return PTR_ERR(cs);
4802 cs = gen12_emit_pipe_control(cs,
4803 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4804 flags, LRC_PPHWSP_SCRATCH_ADDR);
4805 intel_ring_advance(request, cs);
4808 if (mode & EMIT_INVALIDATE) {
4809 u32 flags = 0;
4810 u32 *cs;
4812 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4813 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4814 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4815 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4816 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4817 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4818 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4820 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4821 flags |= PIPE_CONTROL_QW_WRITE;
4823 flags |= PIPE_CONTROL_CS_STALL;
4825 cs = intel_ring_begin(request, 8 + 4);
4826 if (IS_ERR(cs))
4827 return PTR_ERR(cs);
4830 * Prevent the pre-parser from skipping past the TLB
4831 * invalidate and loading a stale page for the batch
4832 * buffer / request payload.
4834 *cs++ = preparser_disable(true);
4836 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4838 /* hsdes: 1809175790 */
4839 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4841 *cs++ = preparser_disable(false);
4842 intel_ring_advance(request, cs);
4845 return 0;
4848 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4850 intel_engine_mask_t aux_inv = 0;
4851 u32 cmd, *cs;
4853 cmd = 4;
4854 if (mode & EMIT_INVALIDATE)
4855 cmd += 2;
4856 if (mode & EMIT_INVALIDATE)
4857 aux_inv = request->engine->mask & ~BIT(BCS0);
4858 if (aux_inv)
4859 cmd += 2 * hweight8(aux_inv) + 2;
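/*
 * Ring-space accounting for what is emitted below: 4 dwords for the
 * MI_FLUSH_DW packet, 2 for the pre-parser disable/enable pair, and, for
 * the aux invalidation, one LRI header plus a (reg, AUX_INV) pair per
 * engine and a trailing NOOP, i.e. 2 * hweight8(aux_inv) + 2.
 */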
4861 cs = intel_ring_begin(request, cmd);
4862 if (IS_ERR(cs))
4863 return PTR_ERR(cs);
4865 if (mode & EMIT_INVALIDATE)
4866 *cs++ = preparser_disable(true);
4868 cmd = MI_FLUSH_DW + 1;
4870 /* We always require a command barrier so that subsequent
4871 * commands, such as breadcrumb interrupts, are strictly ordered
4872 * wrt the contents of the write cache being flushed to memory
4873 * (and thus being coherent from the CPU).
4875 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4877 if (mode & EMIT_INVALIDATE) {
4878 cmd |= MI_INVALIDATE_TLB;
4879 if (request->engine->class == VIDEO_DECODE_CLASS)
4880 cmd |= MI_INVALIDATE_BSD;
4883 *cs++ = cmd;
4884 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4885 *cs++ = 0; /* upper addr */
4886 *cs++ = 0; /* value */
4888 if (aux_inv) { /* hsdes: 1809175790 */
4889 struct intel_engine_cs *engine;
4890 unsigned int tmp;
4892 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4893 for_each_engine_masked(engine, request->engine->gt,
4894 aux_inv, tmp) {
4895 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4896 *cs++ = AUX_INV;
4898 *cs++ = MI_NOOP;
4901 if (mode & EMIT_INVALIDATE)
4902 *cs++ = preparser_disable(false);
4904 intel_ring_advance(request, cs);
4906 return 0;
4909 static void assert_request_valid(struct i915_request *rq)
4911 struct intel_ring *ring __maybe_unused = rq->ring;
4913 /* Can we unwind this request without appearing to go forwards? */
4914 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4918 * Reserve space for 2 NOOPs at the end of each request to be
4919 * used as a workaround for not being allowed to do lite
4920 * restore with HEAD==TAIL (WaIdleLiteRestore).
4922 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4924 /* Ensure there's always at least one preemption point per-request. */
4925 *cs++ = MI_ARB_CHECK;
4926 *cs++ = MI_NOOP;
4927 request->wa_tail = intel_ring_offset(request, cs);
4929 /* Check that entire request is less than half the ring */
4930 assert_request_valid(request);
4932 return cs;
4935 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4937 *cs++ = MI_SEMAPHORE_WAIT |
4938 MI_SEMAPHORE_GLOBAL_GTT |
4939 MI_SEMAPHORE_POLL |
4940 MI_SEMAPHORE_SAD_EQ_SDD;
4941 *cs++ = 0;
4942 *cs++ = intel_hws_preempt_address(request->engine);
4943 *cs++ = 0;
4945 return cs;
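/*
 * A hedged summary: the MI_SEMAPHORE_WAIT above polls the preempt slot in
 * the HWSP (intel_hws_preempt_address()) until it reads back equal to the
 * inline data dword, 0; ring_set_paused() writes a non-zero value there,
 * which appears to be how the engine is stalled at the end of each request
 * during preempt-to-busy.
 */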
4948 static __always_inline u32*
4949 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4951 *cs++ = MI_USER_INTERRUPT;
4953 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4954 if (intel_engine_has_semaphores(request->engine))
4955 cs = emit_preempt_busywait(request, cs);
4957 request->tail = intel_ring_offset(request, cs);
4958 assert_ring_tail_valid(request->ring, request->tail);
4960 return gen8_emit_wa_tail(request, cs);
4963 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4965 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4968 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4970 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4973 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4975 cs = gen8_emit_pipe_control(cs,
4976 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4977 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4978 PIPE_CONTROL_DC_FLUSH_ENABLE,
4979 0);
4981 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4982 cs = gen8_emit_ggtt_write_rcs(cs,
4983 request->fence.seqno,
4984 hwsp_offset(request),
4985 PIPE_CONTROL_FLUSH_ENABLE |
4986 PIPE_CONTROL_CS_STALL);
4988 return gen8_emit_fini_breadcrumb_tail(request, cs);
4991 static u32 *
4992 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4994 cs = gen8_emit_ggtt_write_rcs(cs,
4995 request->fence.seqno,
4996 hwsp_offset(request),
4997 PIPE_CONTROL_CS_STALL |
4998 PIPE_CONTROL_TILE_CACHE_FLUSH |
4999 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5000 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5001 PIPE_CONTROL_DC_FLUSH_ENABLE |
5002 PIPE_CONTROL_FLUSH_ENABLE);
5004 return gen8_emit_fini_breadcrumb_tail(request, cs);
5008 * Note that the CS instruction pre-parser will not stall on the breadcrumb
5009 * flush and will continue pre-fetching the instructions after it before the
5010 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
5011 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
5012 * of the next request before the memory has been flushed, we're guaranteed that
5013 * we won't access the batch itself too early.
5014 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
5015 * so, if the current request is modifying an instruction in the next request on
5016 * the same intel_context, we might pre-fetch and then execute the pre-update
5017 * instruction. To avoid this, the users of self-modifying code should either
5018 * disable the parser around the code emitting the memory writes, via a new flag
5019 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
5020 * the in-kernel use-cases we've opted to use a separate context, see
5021 * reloc_gpu() as an example.
5022 * All the above applies only to the instructions themselves. Non-inline data
5023 * used by the instructions is not pre-fetched.
5026 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
5028 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
5029 MI_SEMAPHORE_GLOBAL_GTT |
5030 MI_SEMAPHORE_POLL |
5031 MI_SEMAPHORE_SAD_EQ_SDD;
5032 *cs++ = 0;
5033 *cs++ = intel_hws_preempt_address(request->engine);
5034 *cs++ = 0;
5035 *cs++ = 0;
5036 *cs++ = MI_NOOP;
5038 return cs;
5041 static __always_inline u32*
5042 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
5044 *cs++ = MI_USER_INTERRUPT;
5046 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5047 if (intel_engine_has_semaphores(request->engine))
5048 cs = gen12_emit_preempt_busywait(request, cs);
5050 request->tail = intel_ring_offset(request, cs);
5051 assert_ring_tail_valid(request->ring, request->tail);
5053 return gen8_emit_wa_tail(request, cs);
5056 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5058 /* XXX Stalling flush before seqno write; post-sync not */
5059 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5060 return gen12_emit_fini_breadcrumb_tail(rq, cs);
5063 static u32 *
5064 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5066 cs = gen12_emit_ggtt_write_rcs(cs,
5067 request->fence.seqno,
5068 hwsp_offset(request),
5069 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5070 PIPE_CONTROL_CS_STALL |
5071 PIPE_CONTROL_TILE_CACHE_FLUSH |
5072 PIPE_CONTROL_FLUSH_L3 |
5073 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5074 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5075 /* Wa_1409600907:tgl */
5076 PIPE_CONTROL_DEPTH_STALL |
5077 PIPE_CONTROL_DC_FLUSH_ENABLE |
5078 PIPE_CONTROL_FLUSH_ENABLE);
5080 return gen12_emit_fini_breadcrumb_tail(request, cs);
5083 static void execlists_park(struct intel_engine_cs *engine)
5085 cancel_timer(&engine->execlists.timer);
5086 cancel_timer(&engine->execlists.preempt);
5089 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5091 engine->submit_request = execlists_submit_request;
5092 engine->schedule = i915_schedule;
5093 engine->execlists.tasklet.func = execlists_submission_tasklet;
5095 engine->reset.prepare = execlists_reset_prepare;
5096 engine->reset.rewind = execlists_reset_rewind;
5097 engine->reset.cancel = execlists_reset_cancel;
5098 engine->reset.finish = execlists_reset_finish;
5100 engine->park = execlists_park;
5101 engine->unpark = NULL;
5103 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5104 if (!intel_vgpu_active(engine->i915)) {
5105 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5106 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5107 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5108 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5109 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5113 if (INTEL_GEN(engine->i915) >= 12)
5114 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5116 if (intel_engine_has_preemption(engine))
5117 engine->emit_bb_start = gen8_emit_bb_start;
5118 else
5119 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5122 static void execlists_shutdown(struct intel_engine_cs *engine)
5124 /* Synchronise with residual timers and any softirq they raise */
5125 del_timer_sync(&engine->execlists.timer);
5126 del_timer_sync(&engine->execlists.preempt);
5127 tasklet_kill(&engine->execlists.tasklet);
5130 static void execlists_release(struct intel_engine_cs *engine)
5132 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5134 execlists_shutdown(engine);
5136 intel_engine_cleanup_common(engine);
5137 lrc_destroy_wa_ctx(engine);
5140 static void
5141 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5143 /* Default vfuncs which can be overridden by each engine. */
5145 engine->resume = execlists_resume;
5147 engine->cops = &execlists_context_ops;
5148 engine->request_alloc = execlists_request_alloc;
5150 engine->emit_flush = gen8_emit_flush;
5151 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5152 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5153 if (INTEL_GEN(engine->i915) >= 12) {
5154 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5155 engine->emit_flush = gen12_emit_flush;
5157 engine->set_default_submission = intel_execlists_set_default_submission;
5159 if (INTEL_GEN(engine->i915) < 11) {
5160 engine->irq_enable = gen8_logical_ring_enable_irq;
5161 engine->irq_disable = gen8_logical_ring_disable_irq;
5162 } else {
5164 * TODO: On Gen11 interrupt masks need to be clear
5165 * to allow C6 entry. Keep interrupts enabled and take
5166 * the hit of generating extra interrupts
5167 * until a more refined solution exists.
5172 static inline void
5173 logical_ring_default_irqs(struct intel_engine_cs *engine)
5175 unsigned int shift = 0;
5177 if (INTEL_GEN(engine->i915) < 11) {
5178 const u8 irq_shifts[] = {
5179 [RCS0] = GEN8_RCS_IRQ_SHIFT,
5180 [BCS0] = GEN8_BCS_IRQ_SHIFT,
5181 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
5182 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
5183 [VECS0] = GEN8_VECS_IRQ_SHIFT,
5186 shift = irq_shifts[engine->id];
5189 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5190 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5191 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5192 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5195 static void rcs_submission_override(struct intel_engine_cs *engine)
5197 switch (INTEL_GEN(engine->i915)) {
5198 case 12:
5199 engine->emit_flush = gen12_emit_flush_render;
5200 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5201 break;
5202 case 11:
5203 engine->emit_flush = gen11_emit_flush_render;
5204 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5205 break;
5206 default:
5207 engine->emit_flush = gen8_emit_flush_render;
5208 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5209 break;
5213 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5215 struct intel_engine_execlists * const execlists = &engine->execlists;
5216 struct drm_i915_private *i915 = engine->i915;
5217 struct intel_uncore *uncore = engine->uncore;
5218 u32 base = engine->mmio_base;
5220 tasklet_init(&engine->execlists.tasklet,
5221 execlists_submission_tasklet, (unsigned long)engine);
5222 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5223 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5225 logical_ring_default_vfuncs(engine);
5226 logical_ring_default_irqs(engine);
5228 if (engine->class == RENDER_CLASS)
5229 rcs_submission_override(engine);
5231 if (intel_init_workaround_bb(engine))
5233 * We continue even if we fail to initialize WA batch
5234 * because we only expect rare glitches, nothing critical
5235 * that would prevent us from using the GPU.
5237 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5239 if (HAS_LOGICAL_RING_ELSQ(i915)) {
5240 execlists->submit_reg = uncore->regs +
5241 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5242 execlists->ctrl_reg = uncore->regs +
5243 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5244 } else {
5245 execlists->submit_reg = uncore->regs +
5246 i915_mmio_reg_offset(RING_ELSP(base));
5249 execlists->csb_status =
5250 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5252 execlists->csb_write =
5253 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5255 if (INTEL_GEN(i915) < 11)
5256 execlists->csb_size = GEN8_CSB_ENTRIES;
5257 else
5258 execlists->csb_size = GEN11_CSB_ENTRIES;
5260 if (INTEL_GEN(engine->i915) >= 11) {
5261 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5262 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5265 /* Finally, take ownership and responsibility for cleanup! */
5266 engine->sanitize = execlists_sanitize;
5267 engine->release = execlists_release;
5269 return 0;
5272 static void init_common_reg_state(u32 * const regs,
5273 const struct intel_engine_cs *engine,
5274 const struct intel_ring *ring,
5275 bool inhibit)
5277 u32 ctl;
5279 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5280 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5281 if (inhibit)
5282 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5283 if (INTEL_GEN(engine->i915) < 11)
5284 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5285 CTX_CTRL_RS_CTX_ENABLE);
5286 regs[CTX_CONTEXT_CONTROL] = ctl;
5288 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5289 regs[CTX_TIMESTAMP] = 0;
5292 static void init_wa_bb_reg_state(u32 * const regs,
5293 const struct intel_engine_cs *engine)
5295 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5297 if (wa_ctx->per_ctx.size) {
5298 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5300 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5301 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5302 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
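/*
 * The low bit OR'ed into the per-ctx pointer is presumably the 'valid'
 * flag telling the CS that a per-context batch is present at that GGTT
 * address.
 */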
5305 if (wa_ctx->indirect_ctx.size) {
5306 lrc_ring_setup_indirect_ctx(regs, engine,
5307 i915_ggtt_offset(wa_ctx->vma) +
5308 wa_ctx->indirect_ctx.offset,
5309 wa_ctx->indirect_ctx.size);
5313 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5315 if (i915_vm_is_4lvl(&ppgtt->vm)) {
5316 /* 64b PPGTT (48bit canonical)
5317 * PDP0_DESCRIPTOR contains the base address to PML4 and
5318 * other PDP Descriptors are ignored.
5320 ASSIGN_CTX_PML4(ppgtt, regs);
5321 } else {
5322 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5323 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5324 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5325 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5329 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5331 if (i915_is_ggtt(vm))
5332 return i915_vm_to_ggtt(vm)->alias;
5333 else
5334 return i915_vm_to_ppgtt(vm);
5337 static void execlists_init_reg_state(u32 *regs,
5338 const struct intel_context *ce,
5339 const struct intel_engine_cs *engine,
5340 const struct intel_ring *ring,
5341 bool inhibit)
5344 * A context is actually a big batch buffer with several
5345 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5346 * values we are setting here are only for the first context restore:
5347 * on a subsequent save, the GPU will recreate this batchbuffer with new
5348 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5349 * we are not initializing here).
5351 * Must keep consistent with virtual_update_register_offsets().
5353 set_offsets(regs, reg_offsets(engine), engine, inhibit);
5355 init_common_reg_state(regs, engine, ring, inhibit);
5356 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5358 init_wa_bb_reg_state(regs, engine);
5360 __reset_stop_ring(regs, engine);
5363 static int
5364 populate_lr_context(struct intel_context *ce,
5365 struct drm_i915_gem_object *ctx_obj,
5366 struct intel_engine_cs *engine,
5367 struct intel_ring *ring)
5369 bool inhibit = true;
5370 void *vaddr;
5372 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5373 if (IS_ERR(vaddr)) {
5374 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5375 return PTR_ERR(vaddr);
5378 set_redzone(vaddr, engine);
5380 if (engine->default_state) {
5381 shmem_read(engine->default_state, 0,
5382 vaddr, engine->context_size);
5383 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5384 inhibit = false;
5387 /* Clear the ppHWSP (inc. per-context counters) */
5388 memset(vaddr, 0, PAGE_SIZE);
5391 * The second page of the context object contains some registers which
5392 * must be set up prior to the first execution.
5394 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5395 ce, engine, ring, inhibit);
5397 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5398 i915_gem_object_unpin_map(ctx_obj);
5399 return 0;
5402 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5404 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5406 return intel_timeline_create_from_engine(ce->engine,
5407 page_unmask_bits(tl));
5410 static int __execlists_context_alloc(struct intel_context *ce,
5411 struct intel_engine_cs *engine)
5413 struct drm_i915_gem_object *ctx_obj;
5414 struct intel_ring *ring;
5415 struct i915_vma *vma;
5416 u32 context_size;
5417 int ret;
5419 GEM_BUG_ON(ce->state);
5420 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5422 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5423 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5425 if (INTEL_GEN(engine->i915) == 12) {
5426 ce->wa_bb_page = context_size / PAGE_SIZE;
5427 context_size += PAGE_SIZE;
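/*
 * Hedged note: on Gen12 one extra page is appended to the context image
 * and its page index is remembered in ce->wa_bb_page; it appears to back
 * the per-context (indirect) workaround batch set up elsewhere in this
 * file.
 */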
5430 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5431 if (IS_ERR(ctx_obj))
5432 return PTR_ERR(ctx_obj);
5434 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5435 if (IS_ERR(vma)) {
5436 ret = PTR_ERR(vma);
5437 goto error_deref_obj;
5440 if (!page_mask_bits(ce->timeline)) {
5441 struct intel_timeline *tl;
5444 * Use the static global HWSP for the kernel context, and
5445 * a dynamically allocated cacheline for everyone else.
5447 if (unlikely(ce->timeline))
5448 tl = pinned_timeline(ce);
5449 else
5450 tl = intel_timeline_create(engine->gt);
5451 if (IS_ERR(tl)) {
5452 ret = PTR_ERR(tl);
5453 goto error_deref_obj;
5456 ce->timeline = tl;
5459 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5460 if (IS_ERR(ring)) {
5461 ret = PTR_ERR(ring);
5462 goto error_deref_obj;
5465 ret = populate_lr_context(ce, ctx_obj, engine, ring);
5466 if (ret) {
5467 drm_dbg(&engine->i915->drm,
5468 "Failed to populate LRC: %d\n", ret);
5469 goto error_ring_free;
5472 ce->ring = ring;
5473 ce->state = vma;
5475 return 0;
5477 error_ring_free:
5478 intel_ring_put(ring);
5479 error_deref_obj:
5480 i915_gem_object_put(ctx_obj);
5481 return ret;
5484 static struct list_head *virtual_queue(struct virtual_engine *ve)
5486 return &ve->base.execlists.default_priolist.requests[0];
5489 static void rcu_virtual_context_destroy(struct work_struct *wrk)
5491 struct virtual_engine *ve =
5492 container_of(wrk, typeof(*ve), rcu.work);
5493 unsigned int n;
5495 GEM_BUG_ON(ve->context.inflight);
5497 /* Preempt-to-busy may leave a stale request behind. */
5498 if (unlikely(ve->request)) {
5499 struct i915_request *old;
5501 spin_lock_irq(&ve->base.active.lock);
5503 old = fetch_and_zero(&ve->request);
5504 if (old) {
5505 GEM_BUG_ON(!i915_request_completed(old));
5506 __i915_request_submit(old);
5507 i915_request_put(old);
5510 spin_unlock_irq(&ve->base.active.lock);
5514 * Flush the tasklet in case it is still running on another core.
5516 * This needs to be done before we remove ourselves from the siblings'
5517 * rbtrees as in the case it is running in parallel, it may reinsert
5518 * the rb_node into a sibling.
5520 tasklet_kill(&ve->base.execlists.tasklet);
5522 /* Decouple ourselves from the siblings, no more access allowed. */
5523 for (n = 0; n < ve->num_siblings; n++) {
5524 struct intel_engine_cs *sibling = ve->siblings[n];
5525 struct rb_node *node = &ve->nodes[sibling->id].rb;
5527 if (RB_EMPTY_NODE(node))
5528 continue;
5530 spin_lock_irq(&sibling->active.lock);
5532 /* Detachment is lazily performed in the execlists tasklet */
5533 if (!RB_EMPTY_NODE(node))
5534 rb_erase_cached(node, &sibling->execlists.virtual);
5536 spin_unlock_irq(&sibling->active.lock);
5538 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5539 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5541 if (ve->context.state)
5542 __execlists_context_fini(&ve->context);
5543 intel_context_fini(&ve->context);
5545 intel_breadcrumbs_free(ve->base.breadcrumbs);
5546 intel_engine_free_request_pool(&ve->base);
5548 kfree(ve->bonds);
5549 kfree(ve);
5552 static void virtual_context_destroy(struct kref *kref)
5554 struct virtual_engine *ve =
5555 container_of(kref, typeof(*ve), context.ref);
5557 GEM_BUG_ON(!list_empty(&ve->context.signals));
5560 * When destroying the virtual engine, we have to be aware that
5561 * it may still be in use from a hardirq/softirq context, causing
5562 * the resubmission of a completed request (background completion
5563 * due to preempt-to-busy). Before we can free the engine, we need
5564 * to flush the submission code and tasklets that are still potentially
5565 * accessing the engine. Flushing the tasklets requires process context,
5566 * and since we can guard the resubmit onto the engine with an RCU read
5567 * lock, we can delegate the free of the engine to an RCU worker.
5569 INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5570 queue_rcu_work(system_wq, &ve->rcu);
5573 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5575 int swp;
5578 * Pick a random sibling on starting to help spread the load around.
5580 * New contexts are typically created with exactly the same order
5581 * of siblings, and often started in batches. Due to the way we iterate
5582 * the array of siblings when submitting requests, sibling[0] is
5583 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5584 * randomised across the system, we also help spread the load by the
5585 * first engine we inspect being different each time.
5587 * NB This does not force us to execute on this engine; it will just
5588 * typically be the first we inspect for submission.
5590 swp = prandom_u32_max(ve->num_siblings);
5591 if (swp)
5592 swap(ve->siblings[swp], ve->siblings[0]);
5595 static int virtual_context_alloc(struct intel_context *ce)
5597 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5599 return __execlists_context_alloc(ce, ve->siblings[0]);
5602 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5604 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5606 /* Note: we must use a real engine class for setting up reg state */
5607 return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5610 static void virtual_context_enter(struct intel_context *ce)
5612 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5613 unsigned int n;
5615 for (n = 0; n < ve->num_siblings; n++)
5616 intel_engine_pm_get(ve->siblings[n]);
5618 intel_timeline_enter(ce->timeline);
5621 static void virtual_context_exit(struct intel_context *ce)
5623 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5624 unsigned int n;
5626 intel_timeline_exit(ce->timeline);
5628 for (n = 0; n < ve->num_siblings; n++)
5629 intel_engine_pm_put(ve->siblings[n]);
5632 static const struct intel_context_ops virtual_context_ops = {
5633 .alloc = virtual_context_alloc,
5635 .pre_pin = execlists_context_pre_pin,
5636 .pin = virtual_context_pin,
5637 .unpin = execlists_context_unpin,
5638 .post_unpin = execlists_context_post_unpin,
5640 .enter = virtual_context_enter,
5641 .exit = virtual_context_exit,
5643 .destroy = virtual_context_destroy,
5646 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5648 struct i915_request *rq;
5649 intel_engine_mask_t mask;
5651 rq = READ_ONCE(ve->request);
5652 if (!rq)
5653 return 0;
5655 /* The rq is ready for submission; rq->execution_mask is now stable. */
5656 mask = rq->execution_mask;
5657 if (unlikely(!mask)) {
5658 /* Invalid selection, submit to a random engine in error */
5659 i915_request_set_error_once(rq, -ENODEV);
5660 mask = ve->siblings[0]->mask;
5663 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5664 rq->fence.context, rq->fence.seqno,
5665 mask, ve->base.execlists.queue_priority_hint);
5667 return mask;
5670 static void virtual_submission_tasklet(unsigned long data)
5672 struct virtual_engine * const ve = (struct virtual_engine *)data;
5673 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5674 intel_engine_mask_t mask;
5675 unsigned int n;
5677 rcu_read_lock();
5678 mask = virtual_submission_mask(ve);
5679 rcu_read_unlock();
5680 if (unlikely(!mask))
5681 return;
5683 local_irq_disable();
5684 for (n = 0; n < ve->num_siblings; n++) {
5685 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5686 struct ve_node * const node = &ve->nodes[sibling->id];
5687 struct rb_node **parent, *rb;
5688 bool first;
5690 if (!READ_ONCE(ve->request))
5691 break; /* already handled by a sibling's tasklet */
5693 if (unlikely(!(mask & sibling->mask))) {
5694 if (!RB_EMPTY_NODE(&node->rb)) {
5695 spin_lock(&sibling->active.lock);
5696 rb_erase_cached(&node->rb,
5697 &sibling->execlists.virtual);
5698 RB_CLEAR_NODE(&node->rb);
5699 spin_unlock(&sibling->active.lock);
5701 continue;
5704 spin_lock(&sibling->active.lock);
5706 if (!RB_EMPTY_NODE(&node->rb)) {
5708 * Cheat and avoid rebalancing the tree if we can
5709 * reuse this node in situ.
5711 first = rb_first_cached(&sibling->execlists.virtual) ==
5712 &node->rb;
5713 if (prio == node->prio || (prio > node->prio && first))
5714 goto submit_engine;
5716 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5719 rb = NULL;
5720 first = true;
5721 parent = &sibling->execlists.virtual.rb_root.rb_node;
5722 while (*parent) {
5723 struct ve_node *other;
5725 rb = *parent;
5726 other = rb_entry(rb, typeof(*other), rb);
5727 if (prio > other->prio) {
5728 parent = &rb->rb_left;
5729 } else {
5730 parent = &rb->rb_right;
5731 first = false;
5735 rb_link_node(&node->rb, rb, parent);
5736 rb_insert_color_cached(&node->rb,
5737 &sibling->execlists.virtual,
5738 first);
5740 submit_engine:
5741 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5742 node->prio = prio;
5743 if (first && prio > sibling->execlists.queue_priority_hint)
5744 tasklet_hi_schedule(&sibling->execlists.tasklet);
5746 spin_unlock(&sibling->active.lock);
5748 local_irq_enable();
5751 static void virtual_submit_request(struct i915_request *rq)
5753 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5754 struct i915_request *old;
5755 unsigned long flags;
5757 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5758 rq->fence.context,
5759 rq->fence.seqno);
5761 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5763 spin_lock_irqsave(&ve->base.active.lock, flags);
5765 old = ve->request;
5766 if (old) { /* background completion event from preempt-to-busy */
5767 GEM_BUG_ON(!i915_request_completed(old));
5768 __i915_request_submit(old);
5769 i915_request_put(old);
5772 if (i915_request_completed(rq)) {
5773 __i915_request_submit(rq);
5775 ve->base.execlists.queue_priority_hint = INT_MIN;
5776 ve->request = NULL;
5777 } else {
5778 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5779 ve->request = i915_request_get(rq);
5781 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5782 list_move_tail(&rq->sched.link, virtual_queue(ve));
5784 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5787 spin_unlock_irqrestore(&ve->base.active.lock, flags);
5790 static struct ve_bond *
5791 virtual_find_bond(struct virtual_engine *ve,
5792 const struct intel_engine_cs *master)
5794 int i;
5796 for (i = 0; i < ve->num_bonds; i++) {
5797 if (ve->bonds[i].master == master)
5798 return &ve->bonds[i];
5801 return NULL;
5804 static void
5805 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5807 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5808 intel_engine_mask_t allowed, exec;
5809 struct ve_bond *bond;
5811 allowed = ~to_request(signal)->engine->mask;
5813 bond = virtual_find_bond(ve, to_request(signal)->engine);
5814 if (bond)
5815 allowed &= bond->sibling_mask;
5817 /* Restrict the bonded request to run on only the available engines */
5818 exec = READ_ONCE(rq->execution_mask);
5819 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5822 /* Prevent the master from being re-run on the bonded engines */
5823 to_request(signal)->execution_mask &= ~allowed;
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need for this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}
	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
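/*
 * Duplicate an existing virtual engine, copying its sibling set and any
 * submit-fence bonds already attached to it.
 */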
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}
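/*
 * Record a bond: requests on this virtual engine that are bonded to @master
 * are restricted to run on @sibling (masks accumulate over repeated calls).
 * The sibling must already be one of the virtual engine's siblings.
 */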
int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}
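/*
 * Print, via @show_request, up to @max requests from each of the engine's
 * queues for debug output: currently executing requests (E), the queued
 * priority tree (Q) and any requests waiting on virtual engines (V).
 */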
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}
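/*
 * Reset a pinned logical ring context after a GPU hang: optionally scrub the
 * context image back to the default state, then restart its ring from @head.
 */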
void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}
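/* True if the engine is currently driven by the execlists submission backend. */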
bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif